ext: add McPAT source
authorAnthony Gutierrez <atgutier@umich.edu>
Tue, 1 Apr 2014 16:44:30 +0000 (12:44 -0400)
committerAnthony Gutierrez <atgutier@umich.edu>
Tue, 1 Apr 2014 16:44:30 +0000 (12:44 -0400)
this patch adds the source for mcpat, a power, area, and timing modeling
framework.

104 files changed:
ext/mcpat/ARM_A9.xml [new file with mode: 0644]
ext/mcpat/ARM_A9_2000.xml [new file with mode: 0644]
ext/mcpat/ARM_A9_800.xml [new file with mode: 0644]
ext/mcpat/Alpha21364.xml [new file with mode: 0644]
ext/mcpat/Niagara1.xml [new file with mode: 0644]
ext/mcpat/Niagara1_sharing.xml [new file with mode: 0644]
ext/mcpat/Niagara1_sharing_DC.xml [new file with mode: 0644]
ext/mcpat/Niagara1_sharing_SBT.xml [new file with mode: 0644]
ext/mcpat/Niagara1_sharing_ST.xml [new file with mode: 0644]
ext/mcpat/Niagara2.xml [new file with mode: 0644]
ext/mcpat/Penryn.xml [new file with mode: 0644]
ext/mcpat/README [new file with mode: 0644]
ext/mcpat/XML_Parse.cc [new file with mode: 0644]
ext/mcpat/XML_Parse.h [new file with mode: 0644]
ext/mcpat/Xeon.xml [new file with mode: 0644]
ext/mcpat/arch_const.h [new file with mode: 0644]
ext/mcpat/array.cc [new file with mode: 0644]
ext/mcpat/array.h [new file with mode: 0644]
ext/mcpat/basic_components.cc [new file with mode: 0644]
ext/mcpat/basic_components.h [new file with mode: 0644]
ext/mcpat/cacti/README [new file with mode: 0644]
ext/mcpat/cacti/Ucache.cc [new file with mode: 0644]
ext/mcpat/cacti/Ucache.h [new file with mode: 0644]
ext/mcpat/cacti/arbiter.cc [new file with mode: 0644]
ext/mcpat/cacti/arbiter.h [new file with mode: 0644]
ext/mcpat/cacti/area.cc [new file with mode: 0644]
ext/mcpat/cacti/area.h [new file with mode: 0644]
ext/mcpat/cacti/bank.cc [new file with mode: 0755]
ext/mcpat/cacti/bank.h [new file with mode: 0755]
ext/mcpat/cacti/basic_circuit.cc [new file with mode: 0644]
ext/mcpat/cacti/basic_circuit.h [new file with mode: 0644]
ext/mcpat/cacti/batch_tests [new file with mode: 0755]
ext/mcpat/cacti/cache.cfg [new file with mode: 0755]
ext/mcpat/cacti/cacti.i [new file with mode: 0644]
ext/mcpat/cacti/cacti.mk [new file with mode: 0644]
ext/mcpat/cacti/cacti_interface.cc [new file with mode: 0644]
ext/mcpat/cacti/cacti_interface.h [new file with mode: 0644]
ext/mcpat/cacti/component.cc [new file with mode: 0644]
ext/mcpat/cacti/component.h [new file with mode: 0644]
ext/mcpat/cacti/const.h [new file with mode: 0644]
ext/mcpat/cacti/contention.dat [new file with mode: 0755]
ext/mcpat/cacti/crossbar.cc [new file with mode: 0644]
ext/mcpat/cacti/crossbar.h [new file with mode: 0644]
ext/mcpat/cacti/decoder.cc [new file with mode: 0644]
ext/mcpat/cacti/decoder.h [new file with mode: 0644]
ext/mcpat/cacti/htree2.cc [new file with mode: 0644]
ext/mcpat/cacti/htree2.h [new file with mode: 0644]
ext/mcpat/cacti/io.cc [new file with mode: 0644]
ext/mcpat/cacti/io.h [new file with mode: 0644]
ext/mcpat/cacti/main.cc [new file with mode: 0644]
ext/mcpat/cacti/makefile [new file with mode: 0644]
ext/mcpat/cacti/mat.cc [new file with mode: 0755]
ext/mcpat/cacti/mat.h [new file with mode: 0755]
ext/mcpat/cacti/nuca.cc [new file with mode: 0644]
ext/mcpat/cacti/nuca.h [new file with mode: 0644]
ext/mcpat/cacti/parameter.cc [new file with mode: 0644]
ext/mcpat/cacti/parameter.h [new file with mode: 0644]
ext/mcpat/cacti/router.cc [new file with mode: 0644]
ext/mcpat/cacti/router.h [new file with mode: 0644]
ext/mcpat/cacti/subarray.cc [new file with mode: 0755]
ext/mcpat/cacti/subarray.h [new file with mode: 0755]
ext/mcpat/cacti/technology.cc [new file with mode: 0644]
ext/mcpat/cacti/uca.cc [new file with mode: 0755]
ext/mcpat/cacti/uca.h [new file with mode: 0755]
ext/mcpat/cacti/wire.cc [new file with mode: 0644]
ext/mcpat/cacti/wire.h [new file with mode: 0644]
ext/mcpat/core.cc [new file with mode: 0644]
ext/mcpat/core.h [new file with mode: 0644]
ext/mcpat/globalvar.h [new file with mode: 0644]
ext/mcpat/interconnect.cc [new file with mode: 0644]
ext/mcpat/interconnect.h [new file with mode: 0644]
ext/mcpat/iocontrollers.cc [new file with mode: 0644]
ext/mcpat/iocontrollers.h [new file with mode: 0644]
ext/mcpat/logic.cc [new file with mode: 0644]
ext/mcpat/logic.h [new file with mode: 0644]
ext/mcpat/main.cc [new file with mode: 0644]
ext/mcpat/makefile [new file with mode: 0644]
ext/mcpat/mcpat.mk [new file with mode: 0644]
ext/mcpat/mcpatXeonCore.mk [new file with mode: 0644]
ext/mcpat/memoryctrl.cc [new file with mode: 0644]
ext/mcpat/memoryctrl.h [new file with mode: 0644]
ext/mcpat/noc.cc [new file with mode: 0644]
ext/mcpat/noc.h [new file with mode: 0644]
ext/mcpat/processor.cc [new file with mode: 0644]
ext/mcpat/processor.h [new file with mode: 0644]
ext/mcpat/results/A9_2000 [new file with mode: 0644]
ext/mcpat/results/A9_2000_withIOC [new file with mode: 0644]
ext/mcpat/results/A9_800 [new file with mode: 0644]
ext/mcpat/results/Alpha21364 [new file with mode: 0644]
ext/mcpat/results/Alpha21364_90nm [new file with mode: 0644]
ext/mcpat/results/Penryn [new file with mode: 0644]
ext/mcpat/results/T1 [new file with mode: 0644]
ext/mcpat/results/T1_DC_64 [new file with mode: 0644]
ext/mcpat/results/T1_SBT_64 [new file with mode: 0644]
ext/mcpat/results/T1_ST_64 [new file with mode: 0644]
ext/mcpat/results/T2 [new file with mode: 0644]
ext/mcpat/results/Xeon_core [new file with mode: 0644]
ext/mcpat/results/Xeon_uncore [new file with mode: 0644]
ext/mcpat/sharedcache.cc [new file with mode: 0644]
ext/mcpat/sharedcache.h [new file with mode: 0644]
ext/mcpat/technology_xeon_core.cc [new file with mode: 0644]
ext/mcpat/version.h [new file with mode: 0644]
ext/mcpat/xmlParser.cc [new file with mode: 0644]
ext/mcpat/xmlParser.h [new file with mode: 0644]

diff --git a/ext/mcpat/ARM_A9.xml b/ext/mcpat/ARM_A9.xml
new file mode 100644 (file)
index 0000000..9289b66
--- /dev/null
@@ -0,0 +1,415 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="2"/>
+               <param name="number_of_L1Directories" value="2"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
+               <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="40"/><!-- nm -->
+               <param name="target_core_clockrate" value="2000"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="1"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
+               <param name="machine_bits" value="32"/>
+               <param name="virtual_address_width" value="32"/>
+               <param name="physical_address_width" value="32"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller 
+                       default value is machine_bits, if not set --> 
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--The virtual_memory_page_size (B) above is completely different from the page size in the main memory section. This page size is the size of 
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="2000"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="1"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="7"/>
+                       <param name="x86" value="0"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="1"/>
+                       <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processors;
+                       it may be more than one only in SMT processors. BTB ports always equal fetch ports since 
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
+                       <param name="fetch_width" value="2"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="2"/>
+                       <!-- decode_width determines the number of ports of the 
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="7"/>
+                       <!-- issue_width determines the number of ports of Issue window and other logic 
+                       as in the complexity effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determins the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+                       These parameters are reserved for future use.--> 
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="8,8"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="3"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="1"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="20"/>
+                       <param name="fp_instruction_window_size" value="15"/>
+                       <!-- Numbers need to be confirmed -->
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="0"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="64"/>
+                       <param name="phy_Regs_FRF_size" value="64"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+                       they will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="4"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="0"/>      
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->    
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only, 
+                               please ignore them if the cores are homogeneous -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats, both RS and Phy based OoOs have ROB
+                       performance simulator should capture the difference on accesses,
+                       otherwise, McPAT has to guess based on number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should 
+                       make sure all the numbers are final numbers, 
+                       including the explicit read/write accesses, 
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on param and stats of last level cache
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="1"/>                 
+                       <stat name="LSU_duty_cycle" value="0.5"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.5"/>
+                       <stat name="ALU_duty_cycle" value="1"/>
+                       <stat name="MUL_duty_cycle" value="0.3"/>
+                       <stat name="FPU_duty_cycle" value="0.3"/>
+                       <stat name="ALU_cdb_duty_cycle" value="1"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.3"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.3"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to itlb, although writes happen to itlb after a miss, 
+                               which is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to icache, although writes happen to it after a miss, 
+                               which is actually a replacement -->
+                               <param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="4, 4, 4,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
+                               <param name="buffer_sizes" value="4, 4, 4, 4"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="2048,4,2, 2, 1,3"/> <!--should be 4096 + 1024 -->
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="2000"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                           <stat name="duty_cycle" value="0.1"/>
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="2000"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>    
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="2000"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="800"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="2000"/>
+                       <param name="type" value="0"/>
+                       <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+                               at each time only one node can send req -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="1"/>
+                       <param name="output_ports" value="1"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="flit_bits" value="128"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="0.5"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               by default, 50% of the NoC global links routes over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="1"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements on MC will be added in later versions. -->
+                       <param name="mc_clock" value="400"/><!--MHz-->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="llc_line_length" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="66666"/>
+                       <stat name="memory_reads" value="33333"/>
+                       <stat name="memory_writes" value="33333"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+       </component>
+</component>
diff --git a/ext/mcpat/ARM_A9_2000.xml b/ext/mcpat/ARM_A9_2000.xml
new file mode 100644 (file)
index 0000000..c040e1b
--- /dev/null
@@ -0,0 +1,463 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <!--Duty cycles in this file are set according to "ARM MPcore
+                       ARchitecture performance Enhancement" in MPF Japan 2008 -->
+               <param name="number_of_cores" value="2"/>
+               <param name="number_of_L1Directories" value="2"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters there are; in each cluster there can be multiple banks/ports -->
+               <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="22"/><!-- nm -->
+               <param name="target_core_clockrate" value="2000"/><!--MHz -->
+               <param name="temperature" value="340"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
+               <param name="opt_clockrate" value="1"/>
+               <param name="machine_bits" value="32"/>
+               <param name="virtual_address_width" value="32"/>
+               <param name="physical_address_width" value="32"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+                       default value is machine_bits, if not set -->
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
+                       virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="2000"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="1"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="7"/>
+                       <param name="x86" value="0"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="1"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
+                       <param name="fetch_width" value="2"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="2"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="7"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use. -->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="8,8"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="3"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="1"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="20"/>
+                       <param name="fp_instruction_window_size" value="15"/>
+                       <!-- Numbers need to be confirmed -->
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="0"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="64"/>
+                       <param name="phy_Regs_FRF_size" value="64"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
+                       They will always try to execute out-of-order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="4"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="0"/>      
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="4"/>                                              
+                       <!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity checking -->
+                       <!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats: both RS-based and Phy-based OoOs have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last-level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.9"/>
+                       <stat name="BR_duty_cycle" value="0.72"/><!--branch-->                  
+                       <stat name="LSU_duty_cycle" value="0.71"/>
+                       <stat name="MemManU_I_duty_cycle" value="0.9"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.71"/>
+                       <stat name="ALU_duty_cycle" value="0.76"/>
+                       <!-- (.78*2+.71)/3 -->
+                       <stat name="MUL_duty_cycle" value="0.82"/>
+                       <stat name="FPU_duty_cycle" value="0.0"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.76"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.82"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.0"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="4"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss;
+                               such a write is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there is no write requests to itlb although writes happen to it after miss, 
+                               which is actually a replacement -->
+                               <param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy: 0 = no write, or write-through with non-write allocate; 1 = write-back with write-allocate -->
+                               <param name="buffer_sizes" value="4, 4, 4,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
+                               <param name="buffer_sizes" value="4, 4, 4, 4"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="4096,4,2, 2, 1,1"/> 
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="2000"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="2"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                           <stat name="duty_cycle" value="0.1"/>
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.1"/>       
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="800"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="2000"/>
+                       <param name="type" value="0"/>
+                       <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+                               at each time only one node can send req -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="1"/>
+                       <param name="output_ports" value="1"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="flit_bits" value="64"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="0.5"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               by default, 50% of the NoC global links routes over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.2"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- the current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements on MC will be added in later versions. -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="400"/><!--MHz-->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
+                       <param name="number_mcs" value="1"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="66666"/>
+                       <stat name="memory_reads" value="33333"/>
+                       <stat name="memory_writes" value="33333"/>
+                       <param name="withPHY" value="1"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+                       the average power per MC or per channel. This is sufficient for most applications. 
+                       Further trackdown can be easily added in later versions. -->                    
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="1"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+                       the average power per nic or per channel. This is sufficient for most applications. -->                   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="1"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual pcie controllers; instead, it takes the total accesses and calculates 
+                       the average power per pcie controller or per channel. This is sufficient for most applications. -->                       
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="1"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+                       the average power per fc or per channel. This is sufficient for most applications -->                     
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/ARM_A9_800.xml b/ext/mcpat/ARM_A9_800.xml
new file mode 100644 (file)
index 0000000..fd7b214
--- /dev/null
@@ -0,0 +1,463 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <!--Duty cycles in this file are set according to "ARM MPcore
+                       ARchitecture performance Enhancement" in MPF Japan 2008 -->
+               <param name="number_of_cores" value="2"/>
+               <param name="number_of_L1Directories" value="2"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+               <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="32"/><!-- nm -->
+               <param name="target_core_clockrate" value="800"/><!--MHz -->
+               <param name="temperature" value="340"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
+               <param name="opt_clockrate" value="0"/>
+               <param name="machine_bits" value="32"/>
+               <param name="virtual_address_width" value="32"/>
+               <param name="physical_address_width" value="32"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller 
+                       default value is machine_bits, if not set --> 
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--The virtual_memory_page_size (B) above is completely different from the page size in the main memory section: it is the size of 
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="800"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="1"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="7"/>
+                       <param name="x86" value="0"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="1"/>
+                       <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processors,
+                       it may only be more than one in SMT processors. BTB ports always equal fetch ports since 
+                       branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.--> 
+                       <param name="fetch_width" value="2"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="2"/>
+                       <!-- decode_width determines the number of ports of the 
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="7"/>
+                       <!-- issue_width determines the number of ports of Issue window and other logic 
+                       as in the complexity effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+                       These parameters are reserved for future use.--> 
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="8,8"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="3"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="1"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="20"/>
+                       <param name="fp_instruction_window_size" value="15"/>
+                       <!-- Numbers need to be confirmed -->
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="0"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="64"/>
+                       <param name="phy_Regs_FRF_size" value="64"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
+                       They will always try to execute out-of-order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="4"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="0"/>      
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->    
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="4"/>                                              
+                       <!-- general stats, defines simulation periods;require total, idle, and busy cycles for sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only, 
+                               please ignore them if the cores are homogeneous -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats, both RS and Phy based OoOs have ROB
+                       performance simulator should capture the difference on accesses,
+                       otherwise, McPAT has to guess based on number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should 
+                       make sure all the numbers are final numbers, 
+                       including the explicit read/write accesses, 
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on param and stats of last level cache
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.9"/>
+                       <stat name="BR_duty_cycle" value="0.72"/><!--branch-->                  
+                       <stat name="LSU_duty_cycle" value="0.71"/>
+                       <stat name="MemManU_I_duty_cycle" value="0.9"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.71"/>
+                       <stat name="ALU_duty_cycle" value="0.76"/>
+                       <!-- (.78*2+.71)/3 -->
+                       <stat name="MUL_duty_cycle" value="0.82"/>
+                       <stat name="FPU_duty_cycle" value="0.0"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.76"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.82"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.0"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="4"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to itlb although writes happen to itlb after a miss, 
+                               which is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to icache although writes happen to it after a miss, 
+                               which is actually a replacement -->
+                               <param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="4, 4, 4,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
+                               <param name="buffer_sizes" value="4, 4, 4, 4"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="4096,4,2, 2, 1,1"/> 
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="800"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="2"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                           <stat name="duty_cycle" value="0.1"/>
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.1"/>       
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="800"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="800"/>
+                       <param name="type" value="0"/>
+                       <!--0:bus, 1:NoC; for a bus, no matter how many nodes share the bus,
+                               only one node can send a request at a time -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="1"/>
+                       <param name="output_ports" value="1"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="flit_bits" value="64"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, each NoC covers only part of the whole chip.
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="0.5"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               by default, 50% of the NoC global links routes over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.2"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements to the MC will be added in later versions. -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="400"/><!--MHz-->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="66666"/>
+                       <stat name="memory_reads" value="33333"/>
+                       <stat name="memory_writes" value="33333"/>
+                       <param name="withPHY" value="1"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       A finer breakdown can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On-chip 10Gb Ethernet NIC, including XAUI PHY and MAC controller -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+                                The lower bound of the clock rate of a 10Gb MAC is 150MHz -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+                                The lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per fc or per channel. This is sufficient for most applications -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Alpha21364.xml b/ext/mcpat/Alpha21364.xml
new file mode 100644 (file)
index 0000000..c40c4f5
--- /dev/null
@@ -0,0 +1,456 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="1"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="1"/>
+               <param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters there are; in each cluster there can be multiple banks/ports -->
+               <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="90"/><!-- nm -->
+               <param name="target_core_clockrate" value="1200"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="0"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+                       the default value is machine_bits, if not set -->
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--The virtual_memory_page_size (B) above is completely different from the page size in the main memory section. This page size is the size of
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="1200"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="1"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="7"/>
+                       <param name="x86" value="0"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="1"/>
+                       <!-- number_instruction_fetch_ports(icache ports) is always 1 in a single-thread processor;
+                       it may be more than one only in SMT processors. BTB ports always equal fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
+                       <param name="fetch_width" value="4"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="4"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="6"/>
+                       <!-- issue_width determines the number of ports of Issue window and other logic
+                       as in the complexity effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="2"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use.-->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="7,7"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="4"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="0"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="1"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="20"/>
+                       <param name="fp_instruction_window_size" value="15"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>          
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="72"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="1"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha);
+                       they will always try to execute out of order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="2"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only,
+                               please ignore them if the cores are homogeneous -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats: both RS and Phy based OoOs have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- ALU stats: by default, the processor has one FPU that includes the divider and
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses,
+                       and the implicit accesses such as replacements etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on param and stats of last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="1"/>                 
+                       <stat name="LSU_duty_cycle" value="1"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="1"/>
+                       <stat name="ALU_duty_cycle" value="1"/>
+                       <stat name="MUL_duty_cycle" value="0.3"/>
+                       <stat name="FPU_duty_cycle" value="1"/>
+                       <stat name="ALU_cdb_duty_cycle" value="1"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.3"/>
+                       <stat name="FPU_cdb_duty_cycle" value="1"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="128"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb; writes do happen to the itlb after a miss, 
+                               but each such write is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache; writes do happen to it after a miss, 
+                               but each such write is actually a replacement -->
+                               <param name="icache_config" value="65536,16,2,1,1,2,16,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy: 0 = no write, or write-through with no-write-allocate; 1 = write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="128"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="65536,16,2,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="6144,4,2,1, 1,3"/> <!--48Kbits -->
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="512,4,0,1,1, 1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="16, 16, 16, 16"/> 
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="1200"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>    
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1835008,16, 8, 16, 32, 32, 12, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="1200"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="850"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="1200"/>
+                       <param name="type" value="1"/>
+                       <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+                               at each time only one node can send req -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="1"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="8"/>
+                       <param name="output_ports" value="7"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="virtual_channel_per_port" value="2"/>
+                       <param name="input_buffer_entries_per_vc" value="128"/>
+                       <param name="flit_bits" value="40"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, each NoC covers only part of the whole chip. 
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="1.0"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               By default, 50% of the NoC global links route over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="1"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="180"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements to the MC model will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="800"/><!--MHz-->
+                       <param name="peak_transfer_rate" value="1600"/><!--MB/S-->
+                       <param name="block_size" value="16"/><!--B-->
+                       <param name="number_mcs" value="2"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="2"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="32"/>
+                       <param name="addressbus_width" value="32"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="6666"/>
+                       <stat name="memory_reads" value="3333"/>
+                       <stat name="memory_writes" value="3333"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+                       the average power per MC or per channel. This is sufficient for most applications. 
+                       Finer-grained tracking can be easily added in later versions. -->                    
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the lower bound of the clock rate of a 10Gb MAC is 150MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+                       the average power per NIC or per channel. This is sufficient for most applications. -->                   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates 
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->                       
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+                       the average power per fc or per channel. This is sufficient for most applications -->                     
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Niagara1.xml b/ext/mcpat/Niagara1.xml
new file mode 100644 (file)
index 0000000..ae748e2
--- /dev/null
@@ -0,0 +1,442 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="8"/>
+               <param name="number_of_L1Directories" value="4"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="4"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="90"/><!-- nm -->
+               <param name="target_core_clockrate" value="1200"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--This page size(B) is completely different from the page size in the main memory section. This page size is the size of 
+                       virtual memory from OS/Archi perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="1200"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller; 
+                       default value is machine_bits, if not set --> 
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processors;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since 
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the 
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of Issue window and other logic 
+                       as in the complexity effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determins the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously-->
+                       <!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+                       These parameters are reserved for future use.--> 
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="6,6"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="1"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="0.125"/>              
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="16"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
+                       They will always try to execute out-of-order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->    
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="800000"/>
+                       <stat name="int_instructions" value="600000"/>
+                       <stat name="fp_instructions" value="20000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="100000"/>
+                       <stat name="store_instructions" value="100000"/>
+                       <stat name="committed_instructions" value="800000"/>
+                       <stat name="committed_int_instructions" value="600000"/>
+                       <stat name="committed_fp_instructions" value="20000"/>
+                       <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only, 
+                               please ignore them if the cores are homogeneous -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats, both RS and Phy based OoOs have ROB
+                       performance simulator should capture the difference on accesses,
+                       otherwise, McPAT has to guess based on number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="1600000"/>
+                       <stat name="float_regfile_reads" value="40000"/>
+                       <stat name="int_regfile_writes" value="800000"/>
+                       <stat name="float_regfile_writes" value="20000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="800000"/>                     
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1000000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.25"/>                      
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0.5"/>
+                       <stat name="FPU_duty_cycle" value="0.4"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.5"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.4"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
+                               which are actually replacements -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss,
+                               which are actually replacements -->
+                               <param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 = no write, or write-through with no-write-allocate; 1 = write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="1200"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                               <stat name="duty_cycle" value="0.45"/>  
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="1200"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.45"/>              
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
+                           <!-- consider 4-way bank interleaving for Niagara 1 -->
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="1200"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.5"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                   <stat name="duty_cycle" value="0.35"/>      
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="1200"/>
+                       <param name="type" value="1"/>
+                       <!-- 1 NoC, 0 bus -->
+                       <param name="horizontal_nodes" value="2"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="8"/>
+                       <param name="output_ports" value="5"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="136"/>
+                       <param name="input_buffer_entries_per_vc" value="2"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, one NoC will cover only part of the whole chip. chip_coverage <=1 -->
+                       <stat name="total_accesses" value="360000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.6"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- the current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements on MC will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="4"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can be easily added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per fc or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Niagara1_sharing.xml b/ext/mcpat/Niagara1_sharing.xml
new file mode 100644 (file)
index 0000000..93531ae
--- /dev/null
@@ -0,0 +1,400 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="64"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="64"/> <!-- This is the number of L2 clusters; within each cluster there can be multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="22"/><!-- nm -->
+               <param name="target_core_clockrate" value="3500"/><!--MHz -->
+               <param name="temperature" value="360"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
+                       virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3500"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+                       default value is machine_bits, if not set -->
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of Issue window and other logic
+                       as in the complexity effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- Current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use.-->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="6,6"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="1"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="0.125"/>              
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="16"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+                       they will always try to execute out-of-order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->    
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="800000"/>
+                       <stat name="int_instructions" value="600000"/>
+                       <stat name="fp_instructions" value="20000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="100000"/>
+                       <stat name="store_instructions" value="100000"/>
+                       <stat name="committed_instructions" value="800000"/>
+                       <stat name="committed_int_instructions" value="600000"/>
+                       <stat name="committed_fp_instructions" value="20000"/>
+                       <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only,
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats, both RS and Phy based OoOs have ROB
+                       performance simulator should capture the difference on accesses,
+                       otherwise, McPAT has to guess based on number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="1600000"/>
+                       <stat name="float_regfile_reads" value="40000"/>
+                       <stat name="int_regfile_writes" value="800000"/>
+                       <stat name="float_regfile_writes" value="20000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="800000"/>                     
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1000000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should 
+                       make sure all the numbers are final numbers, 
+                       including the explicit read/write accesses, 
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on param and stats of last level cache
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.25"/>                      
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0.5"/>
+                       <stat name="FPU_duty_cycle" value="0.4"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.5"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.4"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
+                               which is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss,
+                               which is actually a replacement -->
+                               <param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                               <stat name="duty_cycle" value="0.45"/>  
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.45"/>              
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+                           <!-- consider 4-way bank interleaving for Niagara 1 -->
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.5"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                   <stat name="duty_cycle" value="0.35"/>      
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3500"/>
+                       <param name="type" value="1"/>
+                       <!-- 1 NoC, 0 bus -->
+                       <param name="horizontal_nodes" value="8"/>
+                       <param name="vertical_nodes" value="8"/>
+                       <param name="has_global_link" value="1"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="5"/>
+                       <param name="output_ports" value="5"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
+                       <stat name="total_accesses" value="360000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.1"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- current version of McPAT uses published values for base parameters of memory controller;
+                       improvements on MC will be added in later versions. -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="llc_line_length" value="64"/><!--B-->
+                       <param name="number_mcs" value="4"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Further breakdown can be easily added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+       </component>
+</component>
diff --git a/ext/mcpat/Niagara1_sharing_DC.xml b/ext/mcpat/Niagara1_sharing_DC.xml
new file mode 100644 (file)
index 0000000..574ec81
--- /dev/null
@@ -0,0 +1,442 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="64"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="8"/>
+               <param name="number_of_L2s" value="64"/> <!-- This is the number of L2 clusters; each cluster can contain multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="22"/><!-- nm -->
+               <param name="target_core_clockrate" value="3500"/><!--MHz -->
+               <param name="temperature" value="360"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!-- The virtual_memory_page_size above (in bytes) is completely different from the page size in the main memory section: it is the
+                       virtual memory page size from the OS/architecture perspective, whereas the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3500"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
+                       the default value is machine_bits if not set -->
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out of the BTB at once. -->
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use. -->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="6,6"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="1"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="0.125"/>              
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="16"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT supports 2 types of OoO cores: RS based and physical reg based -->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be a RAM-based (0) or CAM-based (1) rename scheme.
+                       The RAM-based scheme has a free list and a status table;
+                       the CAM-based scheme has the valid bit in the data field of the CAM.
+                       Both RAM and CAM need a RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       for the detailed RAT implementation, see the TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+                       they will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+                       as well as the ports of the Dcache that is connected to the LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity check  -->
+                       <!-- please note: if the target architecture is x86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="800000"/>
+                       <stat name="int_instructions" value="600000"/>
+                       <stat name="fp_instructions" value="20000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="100000"/>
+                       <stat name="store_instructions" value="100000"/>
+                       <stat name="committed_instructions" value="800000"/>
+                       <stat name="committed_int_instructions" value="600000"/>
+                       <stat name="committed_fp_instructions" value="20000"/>
+                       <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats: both RS-based and Phy-based OoOs have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="1600000"/>
+                       <stat name="float_regfile_reads" value="40000"/>
+                       <stat name="int_regfile_writes" value="800000"/>
+                       <stat name="float_regfile_writes" value="20000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="800000"/>                     
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1000000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on params and stats of the last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.25"/>                      
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0.5"/>
+                       <stat name="FPU_duty_cycle" value="0.4"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.5"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.4"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb; writes do happen to the itlb after a miss,
+                               but each of those is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache; writes do happen to it after a miss,
+                               but each of those is actually a replacement -->
+                               <param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                               <stat name="duty_cycle" value="0.45"/>  
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,9,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.45"/>              
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+                           <!-- consider 4-way bank interleaving for Niagara 1 -->
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.5"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                   <stat name="duty_cycle" value="0.35"/>      
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3500"/>
+                       <param name="type" value="1"/>
+                       <!-- 1 NoC, 0 bus -->
+                       <param name="horizontal_nodes" value="8"/>
+                       <param name="vertical_nodes" value="8"/>
+                       <param name="has_global_link" value="1"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="5"/>
+                       <param name="output_ports" value="5"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
+                       <stat name="total_accesses" value="360000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.1"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements to the MC model will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- number_ranks: # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+                                the lower bound of the clock rate of a 10Gb MAC is 150MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+                                the lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per-controller sustainable peak rate, MB/s -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per fc or per channel. This is sufficient for most applications -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Niagara1_sharing_SBT.xml b/ext/mcpat/Niagara1_sharing_SBT.xml
new file mode 100644 (file)
index 0000000..32eeca3
--- /dev/null
@@ -0,0 +1,455 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="64"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="64"/> <!-- This is the number of L2 clusters; each cluster can have multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="22"/><!-- nm -->
+               <param name="target_core_clockrate" value="3500"/><!--MHz -->
+               <param name="temperature" value="360"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!-- This page size (virtual_memory_page_size, in bytes) is completely different from the page size in the main memory section: it is the size of a
+                       virtual memory page from the OS/architecture perspective, whereas the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3500"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
+                       the default value is machine_bits, if not set -->
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use. -->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="6,6"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="1"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="0.125"/>              
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="16"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+                       they will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the number of ports of the load and store buffers
+                       as well as the ports of the Dcache, which is connected to the LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats; they define the simulation period. Total, idle, and busy cycles are required for a sanity check -->
+                       <!-- please note: if the target architecture is x86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="800000"/>
+                       <stat name="int_instructions" value="600000"/>
+                       <stat name="fp_instructions" value="20000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="100000"/>
+                       <stat name="store_instructions" value="100000"/>
+                       <stat name="committed_instructions" value="800000"/>
+                       <stat name="committed_int_instructions" value="600000"/>
+                       <stat name="committed_fp_instructions" value="20000"/>
+                       <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats; both RS-based and physical-register-based OoO cores have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="1600000"/>
+                       <stat name="float_regfile_reads" value="40000"/>
+                       <stat name="int_regfile_writes" value="800000"/>
+                       <stat name="float_regfile_writes" value="20000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns) -->
+                       <!-- ALU stats: by default, the processor has one FPU that includes the divider and
+                        multiplier. The FPU accesses should include accesses to the multiplier and divider -->
+                       <stat name="ialu_accesses" value="800000"/>                     
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1000000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!-- Currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last-level cache.
+                       The same rule applies to all cache access stats too! -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.25"/>                      
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0.5"/>
+                       <stat name="FPU_duty_cycle" value="0.4"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.5"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.4"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there is no write requests to itlb although writes happen to itlb after miss, 
+                               which is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss;
+                               such a write is actually a replacement -->
+                               <param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 = no write, or write-through with non-write-allocate; 1 = write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->    
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                               <stat name="duty_cycle" value="0.45"/>  
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->    
+                               <param name="Dir_config" value="8388608,9,0,1,100, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,8"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.45"/>              
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                           <param name="merged_dir" value="1"/><!--if static bank tag is used as the directory -->
+                               <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+                           <!-- consider 4-way bank interleaving for Niagara 1 -->
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.5"/>       
+                               <stat name="coherent_read_accesses" value="400000"/>
+                               <stat name="coherent_write_accesses" value="0"/>
+                               <stat name="coherent_read_misses" value="400000"/>
+                               <stat name="coherent_write_misses" value="0"/>
+                           <stat name="dir_duty_cycle" value="0.5"/>
+                       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                   <stat name="duty_cycle" value="0.35"/>
+                               <param name="Merged_dir" value="1"/><!--if static bank tag is used as the directory -->
+                               <stat name="coherent_read_accesses" value="400000"/>
+                               <stat name="coherent_write_accesses" value="0"/>
+                               <stat name="coherent_read_misses" value="400000"/>
+                               <stat name="coherent_write_misses" value="0"/>
+                           <stat name="dir_duty_cycle" value="0.5"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3500"/>
+                       <param name="type" value="1"/>
+                       <!-- 1 NoC, 0 bus -->
+                       <param name="horizontal_nodes" value="8"/>
+                       <param name="vertical_nodes" value="8"/>
+                       <param name="has_global_link" value="1"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="5"/>
+                       <param name="output_ports" value="5"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+                       <stat name="total_accesses" value="360000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.1"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!-- number of Dram_chips_per_rank =
+                            72 / output_width_of_DRAM_chip -->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- the current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements to the MC model will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+                                The lower bound on the clock rate of a 10Gb MAC is 150MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+                                The lower bound on the clock rate of the PCIe per-lane logic is 120MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per flash controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Niagara1_sharing_ST.xml b/ext/mcpat/Niagara1_sharing_ST.xml
new file mode 100644 (file)
index 0000000..3f0573f
--- /dev/null
@@ -0,0 +1,443 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="64"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="1"/>
+               <param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="22"/><!-- nm -->
+               <param name="target_core_clockrate" value="3500"/><!--MHz -->
+               <param name="temperature" value="360"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of a
+                       virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3500"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
+                       the default value is machine_bits, if not set -->
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out of the BTB at once.-->
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use.-->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="6,6"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="1"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="0.125"/>              
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="16"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
+                       They will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="32"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="32"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+                       as well as the ports of the Dcache which is connected to the LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, define simulation periods; total, idle, and busy cycles are required for a sanity check  -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="800000"/>
+                       <stat name="int_instructions" value="600000"/>
+                       <stat name="fp_instructions" value="20000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="100000"/>
+                       <stat name="store_instructions" value="100000"/>
+                       <stat name="committed_instructions" value="800000"/>
+                       <stat name="committed_int_instructions" value="600000"/>
+                       <stat name="committed_fp_instructions" value="20000"/>
+                       <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats; both RS-based and Phy-based OoOs have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="1600000"/>
+                       <stat name="float_regfile_reads" value="40000"/>
+                       <stat name="int_regfile_writes" value="800000"/>
+                       <stat name="float_regfile_writes" value="20000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- ALU stats: by default, the processor has one FPU that includes the divider and
+                        multiplier. The fpu accesses should include accesses to the multiplier and divider  -->
+                       <stat name="ialu_accesses" value="800000"/>                     
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1000000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.25"/>                      
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0.5"/>
+                       <stat name="FPU_duty_cycle" value="0.4"/>
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.5"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.4"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss;
+                               such a write is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss;
+                               such a write is actually a replacement -->
+                               <param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 = no write, or write-through with no-write-allocate; 1 = write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->    
+                               <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+                               <stat name="duty_cycle" value="0.45"/>  
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->    
+                               <param name="Dir_config" value="8388608,9,0,1,100, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3500"/>
+                               <param name="ports" value="0,0,8"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>
+                           <stat name="duty_cycle" value="0.45"/>              
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+                           <param name="Merged_dir" value="1"/>
+                           <!-- consider 4-way bank interleaving for Niagara 1 -->
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.5"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="Merged_dir" value="1"/>
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                   <stat name="duty_cycle" value="0.35"/>      
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3500"/>
+                       <param name="type" value="1"/>
+                       <!-- 1 NoC, 0 bus -->
+                       <param name="horizontal_nodes" value="8"/>
+                       <param name="vertical_nodes" value="8"/>
+                       <param name="has_global_link" value="1"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="5"/>
+                       <param name="output_ports" value="5"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, each NoC covers part of the whole chip. chip_coverage <=1 -->
+                       <stat name="total_accesses" value="360000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="0.1"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements to the MC model will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+                                The lower bound of the clock rate of a 10Gb MAC is 150MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+                                The lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per flash controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               </component>
+</component>
\ No newline at end of file
diff --git a/ext/mcpat/Niagara2.xml b/ext/mcpat/Niagara2.xml
new file mode 100644 (file)
index 0000000..c7e311f
--- /dev/null
@@ -0,0 +1,438 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="8"/>
+               <param name="number_of_L1Directories" value="8"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="8"/> <!-- This is the number of L2 clusters; each cluster can have multiple banks/ports -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="65"/><!-- nm -->
+               <param name="target_core_clockrate" value="1400"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--virtual_memory_page_size (B) is completely different from the page size in the main memory section. It is the size of
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="1400"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="9"/>
+                       <!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
+                       the default value is machine_bits, if not set -->
+                       <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+                       <!-- inorder/OoO -->
+                       <param name="number_hardware_threads" value="4"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-threaded processor;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
+                       <param name="fetch_width" value="1"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="1"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="1"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="1"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="1"/>
+                       <param name="prediction_width" value="0"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use.-->
+                       <param name="pipelines_per_core" value="2,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="8,8"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="2"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="0"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="1"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="16"/>
+                       <param name="fp_instruction_window_size" value="16"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="80"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="32"/>                  
+                       <param name="archi_Regs_FRF_size" value="32"/>
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="80"/>
+                       <param name="phy_Regs_FRF_size" value="80"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="8"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha);
+                       they will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="64"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="64"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="1"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+                       as well as the ports of the Dcache, which is connected to the LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="32"/>                                             
+                       <!-- general stats, define simulation periods; total, idle, and busy cycles are required for a sanity check -->
+                       <!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="1600000"/>
+                       <stat name="int_instructions" value="1200000"/>
+                       <stat name="fp_instructions" value="40000"/>
+                       <stat name="branch_instructions" value="0"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="200000"/>
+                       <stat name="store_instructions" value="200000"/>
+                       <stat name="committed_instructions" value="1600000"/>
+                       <stat name="committed_int_instructions" value="1200000"/>
+                       <stat name="committed_fp_instructions" value="40000"/>
+                       <stat name="pipeline_duty_cycle" value="0.5"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them if the cores are homogeneous -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats; both RS-based and Phy-based OoO cores have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="263886"/>
+                       <stat name="ROB_writes" value="263886"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_accesses" value="263886"/>
+                       <stat name="fp_rename_accesses" value="263886"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="263886"/>
+                       <stat name="inst_window_writes" value="263886"/>
+                       <stat name="inst_window_wakeup_accesses" value="263886"/>
+                       <stat name="fp_inst_window_reads" value="263886"/>
+                       <stat name="fp_inst_window_writes" value="263886"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="3200000"/>
+                       <stat name="float_regfile_reads" value="80000"/>
+                       <stat name="int_regfile_writes" value="1600000"/>
+                       <stat name="float_regfile_writes" value="40000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="1600000"/>                    
+                       <stat name="fpu_accesses" value="10000"/>
+                       <stat name="mul_accesses" value="100000"/>
+                       <stat name="cdb_alu_accesses" value="1200000"/>
+                       <stat name="cdb_mul_accesses" value="0"/>
+                       <stat name="cdb_fpu_accesses" value="0"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  Currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last-level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="0.5"/>                       
+                       <stat name="LSU_duty_cycle" value="0.25"/>
+                       <stat name="MemManU_I_duty_cycle" value="0.5"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.25"/>
+                       <stat name="ALU_duty_cycle" value="0.9"/>
+                       <stat name="MUL_duty_cycle" value="0"/>
+                       <stat name="FPU_duty_cycle" value="0.6"/>
+                       <!--FPU also handles Mul/div -->
+                       <stat name="ALU_cdb_duty_cycle" value="0.9"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.6"/>   
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="64"/>
+                               <stat name="total_accesses" value="800000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss;
+                               such a write is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss;
+                               such a write is actually a replacement -->
+                               <param name="icache_config" value="16384,32,8,1,1,7,8,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="128"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="8192,16,4,1, 1,3, 16,0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="8192,4,2,1, 1,3"/>
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1024,2,0,1,1,1, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="1400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="1400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types,
+                               the performance simulator needs to cast them into reads or writes,
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>                        
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="524228,64,16,1, 8,23, 64,1"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="1400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="400000"/>
+                               <stat name="write_accesses" value="0"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="write_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1"/> 
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="1048576,64,16,1, 2,100, 64, 1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="3500"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="0.35"/>                              
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="1400"/>
+                       <param name="horizontal_nodes" value="2"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="9"/>
+                       <param name="output_ports" value="8"/>
+                       <param name="virtual_channel_per_port" value="1"/>
+                       <!-- input buffer; in classic routers only input ports need buffers -->
+                       <param name="flit_bits" value="136"/>
+                       <param name="input_buffer_entries_per_vc" value="16"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NoCs are present, each NoC will cover part of the whole chip. chip_coverage <= 1 -->
+                       <stat name="total_accesses" value="160000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                   <stat name="duty_cycle" value="0.1"/>
+               </component>
+               
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements on the MC will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="400"/><!--MHz-->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
+                       <param name="number_mcs" value="4"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="66666"/>
+                       <stat name="memory_reads" value="33333"/>
+                       <stat name="memory_writes" value="33333"/>
+                       <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+                       the average power per MC or per channel. This is sufficient for most applications.
+                       Finer-grained tracking can easily be added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="2"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+                       the average power per NIC or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="1"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+                       the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+                       <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+                       the average power per fc or per channel. This is sufficient for most applications -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
diff --git a/ext/mcpat/Penryn.xml b/ext/mcpat/Penryn.xml
new file mode 100644 (file)
index 0000000..fe9715b
--- /dev/null
@@ -0,0 +1,456 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="2"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters there are; in each cluster there can be multiple banks/ports -->
+               <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="45"/><!-- nm -->
+               <param name="target_core_clockrate" value="3700"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="2"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+                       the default value is machine_bits, if not set -->
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!--The virtual_memory_page_size (B) is completely different from the page size in the main memory section. This page size is the size of
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3700"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="1"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="16"/>
+                       <param name="x86" value="1"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="1"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+                       it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
+                       <param name="fetch_width" value="4"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="4"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="6"/><!--As shown in Wiki figure which has max 5 ports, store data/address is modeled 
+                                                                                                                 as a single port.-->
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="2"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously-->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use.-->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="14,14"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="6"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="2"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/><!--Inst. + micro-op -->
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="1"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="32"/>
+                       <param name="fp_instruction_window_size" value="32"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="96"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->                 
+                       <param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="256"/>
+                       <param name="phy_Regs_FRF_size" value="256"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
+                       They will always try to execute out of order though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="96"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="48"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="2"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+                       as well as the ports of the Dcache which is connected to the LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="64"/>                                             
+                       <!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
+                       <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats: both RS and Phy based OoOs have a ROB.
+                       The performance simulator should capture the difference on accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- Alu stats by default, the processor has one FPU that includes the divider and 
+                        multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="1"/>                 
+                       <stat name="LSU_duty_cycle" value="0.5"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.5"/>
+                       <stat name="ALU_duty_cycle" value="1"/>
+                       <stat name="MUL_duty_cycle" value="0.3"/>
+                       <stat name="FPU_duty_cycle" value="0.3"/>
+                       <stat name="ALU_cdb_duty_cycle" value="1"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.3"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.3"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="128"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there are no write requests to the itlb, although writes happen to the itlb after a miss;
+                               such a write is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss;
+                               such a write is actually a replacement -->
+                               <param name="icache_config" value="32768,32,8,1,4,4,32,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="256"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="32768,32,8,1, 4,6, 32,1 "/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>    
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="6291456,64, 16, 8, 8, 23, 32, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3700"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="850"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3400"/>
+                       <param name="type" value="0"/>
+                       <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+                               at each time only one node can send req -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="1"/>
+                       <param name="output_ports" value="1"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="0.5"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               by default, 50% of the NoC global links routes over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="1"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+                       <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+                       improvements on MC will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates 
+                       the average power per MC or per channel. This is sufficient for most applications. 
+                       Further trackdown can be easily added in later versions. -->                    
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates 
+                       the average power per nic or per channel. This is sufficient for most applications. -->                   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates 
+                       the average power per pcie controller or per channel. This is sufficient for most applications. -->                       
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates 
+                       the average power per fc or per channel. This is sufficient for most applications -->                     
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
+
diff --git a/ext/mcpat/README b/ext/mcpat/README
new file mode 100644 (file)
index 0000000..4887b10
--- /dev/null
@@ -0,0 +1,226 @@
+ __  __      ____   _  _____   ____       _         
+|  \/  | ___|  _ \ / \|_   _| | __ )  ___| |_  __ _ 
+| |\/| |/ __| |_) / _ \ | |   |  _ \ / _ \ __|/ _` |
+| |  | | (__|  __/ ___ \| |   | |_) |  __/ |_| (_| |
+|_|  |_|\___|_| /_/   \_\_|   |____/ \___|\__|\__,_|
+
+McPAT: Multicore Power, Area, and Timing
+Current version 0.8Beta 
+===============================
+
+McPAT is an architectural modeling tool for chip multiprocessors (CMP).
+The main focus of McPAT is accurate power and area
+modeling, and a target clock rate is used as a design constraint. 
+McPAT performs automatic extensive search to find optimal designs 
+that satisfy the target clock frequency.  
+
+For complete documentation of McPAT, please refer to the McPAT 1.0
+technical report and the following paper,
+"McPAT: An Integrated Power, Area, and Timing Modeling
+ Framework for Multicore and Manycore Architectures", 
+that appears in MICRO 2009. Please cite the paper, if you use
+McPAT in your work. The bibtex entry is provided below for your convenience.
+
+ @inproceedings{mcpat:micro,
+ author = {Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi},
+ title =  "{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}",
+ booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
+ year = {2009},
+ pages = {469--480},
+ }
+
+Current McPAT is in its beta release. 
+List of features of beta release
+===============================
+The following are the list of features supported by the tool. 
+
+* Power, area, and timing models for CMPs with:
+      Inorder cores both single and multithreaded
+      OOO cores both single and multithreaded
+      Shared/coherent caches with directory hardware:
+       including directory cache, shadowed tag directory
+       and static bank mapped tag directory
+      Network-on-Chip
+      On-chip memory controllers
+    
+* Internal models are based on real modern processors:
+  Inorder models are based on Sun Niagara family
+  OOO models are based on Intel P6 for reservation 
+  station based OOO cores, and on Intel Netburst and 
+  Alpha 21264 for physical register file based OOO cores.     
+
+* Leakage power modeling considers both sub-threshold leakage 
+  and gate leakage power. The impact of operating temperature 
+  on both types of leakage power is considered. Longer channel devices 
+  that can reduce leakage significantly with modest performance 
+  penalty are also modeled.
+  
+* McPAT supports automatic extensive search to find optimal designs 
+  that satisfy the target clock frequency. The timing constraints 
+  include both throughput and latency.
+
+* Interconnect model with different delay, power, and area 
+  properties, as well as both the aggressive and conservative 
+  interconnect projections on wire technologies. 
+
+* All process specific values used by the McPAT are obtained
+  from ITRS and currently, the McPAT supports 90nm, 65nm, 45nm, 
+  32nm, and 22nm technology nodes. At 32nm and 22nm nodes, SOI 
+  and DG devices are used. After 45nm, Hi-K metal gates are used.
+
+How to use the tool?
+====================
+
+McPAT takes input parameters from an XML-based interface,
+then it computes area and peak power of the target processor. 
+Please note that the peak power is the absolute worst case power, 
+which could be even higher than TDP. 
+
+1. Steps to run McPAT:
+   -> define the target processor using inorder.xml or OOO.xml 
+   -> run the "mcpat" binary:
+      ./mcpat -infile <*.xml>  -print_level < level of detailed output>
+      ./mcpat -h (or mcpat --help) will show the quick help message.
+
+   Rather than being hardwired to certain simulators, McPAT 
+   uses an XML-based interface to enable easy integration
+   with various performance simulators. Our collaborator, 
+   Richard Strong, at University of California, San Diego, 
+   designed an experimental parser for the M5 simulator, aiming for 
+   streamlining the integration of McPAT and M5. Please check the M5 
+   repository/ for the latest version of the parser.
+   
+2. Optimize:
+   McPAT will try its best to satisfy the target clock rate. 
+   When it cannot find a valid solution, it gives out warnings, 
+   while still giving a solution that is closest to the timing 
+   constraints and calculate power based on it. The optimization 
+   will lead to larger power/area numbers for target higher clock
+   rate. McPAT also provides the option "-opt_for_clk" to turn on 
+   ("-opt_for_clk 1") and off this strict optimization for the 
+   timing constraint. When it is off, McPAT always optimizes 
+   components for ED^2P without worrying about meeting the 
+   target clock frequency. By turning it off, the computation time 
+   can be reduced, which suits situations where the target clock rate
+   is conservative.
+  
+3. The output:
+   McPAT outputs results in a hierarchical manner. Increasing 
+   the "-print_level" will show detailed results inside each 
+   component. For each component, major parts are shown, and associated 
+   pipeline registers/control logic are added up in total area/power of each 
+   component. In general, McPAT does not model the area/overhead of the pad 
+   frame used in a processor die.
+   
+4. How to use the XML interface for McPAT 
+   4.1 Set up the parameters
+               Parameters of target designs need to be set in the *.xml file for 
+               entries tagged as "param". McPAT has very detailed parameter settings. 
+               Please remove the structure parameter from the file if you want 
+               to use the default values. Otherwise, the parameters in the xml file 
+               will override the default values. 
+   
+   4.2 Pass the statistics
+               There are two options to get the correct stats: a) the performance 
+               simulator can capture all the stats in detail and pass them to McPAT;
+               b). Performance simulator can only capture partial stats and pass 
+               them to McPAT, while McPAT can reason about the complete stats using 
+        the partial information and the configuration. Therefore, there is 
+        some overlap for the stats. 
+   
+   4.3 Interface XML file structures (PLEASE READ!)
+                       The XML is hierarchical from processor level to micro-architecture 
+               level. McPAT supports both heterogeneous and homogeneous manycore processors. 
+               
+                       1). For heterogeneous processor setup, each component (core, NoC, cache, 
+               and etc) must have its own instantiations (core0, core1, ..., coreN). 
+               Each instantiation will have different parameters as well as its stats.
+               Thus, the XML file must have multiple "instantiation" of each type of 
+               heterogeneous components and the corresponding hetero flags must be set 
+               in the XML file. Then the stats in the XML should be the stats of "a" instantiation 
+               (e.g. "a" cores). The reported runtime dynamic is of a single instantiation 
+               (e.g. "a" cores). Since the stats for each (e.g. "a" cores) may be different,
+               we will see a whole list of (e.g. "a" cores) with different dynamic power,
+               and total power is just a sum of them.  
+               
+                       2). For homogeneous processors, the same method for heterogeneous can 
+               also be used by treating all homogeneous instantiations as heterogeneous. 
+               However, a preferred approach is to use a single representative for all 
+               the same components (e.g. core0 to represent all cores) and set the 
+               processor to have homogeneous components (e.g. 
+               <param name="homogeneous_cores" value="1"/> ). Thus, the XML file only has one instantiation to represent 
+               all others with the same architectural parameters. The corresponding homo 
+               flags must be set in the XML file.  Then, the stats in the XML should be 
+               the aggregated stats of the sum of all instantiations (e.g. aggregated stats 
+               of all cores). In the final results, McPAT will only report a single 
+               instantiation of each type of component, and the reported runtime dynamic power
+               is the sum of all instantiations of the same type. This approach can run fast 
+               and use much less memory.        
+
+5. Guide for integrating McPAT into performance simulators and bypassing the XML interface
+               The detailed work flow of McPAT has two phases: the initialization phase and
+   the computation phase. Specifically, in order to start the initialization phase a 
+   user specifies static configurations, including parameters at all three levels, 
+   namely, architectural, circuit, and technology levels. During the initialization 
+   phase, McPAT will generate the internal chip representation using the configurations 
+   set by the user. 
+               The computation phase of McPAT is called by McPAT or the performance simulator 
+   during simulation to generate runtime power numbers. Before calling McPAT to 
+   compute runtime power numbers, the performance simulator needs to pass the 
+   statistics, namely, the activity factors of each individual components to McPAT 
+   via the XML interface. 
+               The initialization phase is very time-consuming, since it will repeat many 
+   times until valid configurations are found or the possible configurations are 
+   exhausted. To reduce the overhead, a user can let the simulator call McPAT 
+   directly for computation phase and only call initialization phase once at the 
+   beginning of simulation. In this case, the XML interface file is bypassed, 
+   please refer to processor.cc to see how the two phases are called.
+   
+6. Sample input files:
+   This package provides sample XML files for validating target processors. Please find the 
+   enclosed Niagara1.xml (for the Sun Niagara1 processor), Niagara2.xml (for the Sun Niagara2 
+   processor), Alpha21364.xml (for the Alpha21364 processor), and Xeon.xml (for the Intel 
+   Xeon Tulsa processor). 
+   
+   Special instructions for using Xeon.xml:
+   McPAT uses ITRS device types including HP, LSTP, and LOP. Although most 
+   designs follow ITRS projections, there are designs with special technologies. 
+   For example, the 65nm Xeon Tulsa processor uses 1.25 V rather than 1.1V 
+   for the core voltage domain, which results in the changes in threshold voltage,
+   leakage current density, saturation current, and etc, besides the different 
+   supply voltage. We use MASTAR to match the special technology as used in Xeon 
+   core domain. Therefore, in order to generate accurate results of Xeon 
+   Tulsa cores, users need to run "make TAR=mcpatXeonCore" and use the generated 
+   special executable. The L3 cache and buses must be computed using standard 
+   ITRS technology.    
+    
+
+====================
+McPAT is in its beginning stage. We are still improving 
+the tool and refining the code. Please come back to its website 
+for newer versions. If you have any comments, 
+questions, or suggestions, please write to us.
+
+Version history and roadmap
+
+McPAT Alpha:      released Sep. 2009 Experimental release
+McPAT Beta (0.6): released Nov. 2009 New code base and technology base
+McPAT Beta (0.7): released May. 2010 Added various new models, 
+                  including long channel devices, buses model; together
+                  with bug fixes and extensive code optimization to reduce 
+                  memory usage.  
+McPAT Beta (0.8): released Aug. 2010 Added various new models, 
+                  including on-chip 10Gb ethernet units, PCIe, and flash controllers.
+Next major release:     
+McPAT 1.0:        including advanced power-saving states
+
+Future releases may include the modeling of embedded low-power 
+processors as well as vector processors and GPGPUs.             
+                  
+
+Sheng Li             
+sheng.li@hp.com 
+
+
+
+
diff --git a/ext/mcpat/XML_Parse.cc b/ext/mcpat/XML_Parse.cc
new file mode 100644 (file)
index 0000000..ae3ee6f
--- /dev/null
@@ -0,0 +1,1798 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#include <cstdio>
+#include <string>
+
+#include "XML_Parse.h"
+#include "xmlParser.h"
+
+using namespace std;
+
+void ParseXML::parse(char* filepath)
+{
+        unsigned int i,j,k,m,n;
+        unsigned int NumofCom_4;
+        unsigned int itmp;
+        //Initialize all structures
+        ParseXML::initialize();
+
+        // this open and parse the XML file:
+        XMLNode xMainNode=XMLNode::openFileHelper(filepath,"component"); //the 'component' in the first layer
+
+        XMLNode xNode2=xMainNode.getChildNode("component"); // the 'component' in the second layer
+        //get all params in the second layer
+        itmp=xNode2.nChildNode("param");
+        for(i=0; i<itmp; i++)
+        {
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_cores")==0) {sys.number_of_cores=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L1Directories")==0) {sys.number_of_L1Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L2Directories")==0) {sys.number_of_L2Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L2s")==0) {sys.number_of_L2s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Private_L2")==0) {sys.Private_L2=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L3s")==0) {sys.number_of_L3s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_NoCs")==0) {sys.number_of_NoCs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_dir_levels")==0) {sys.number_of_dir_levels=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"domain_size")==0) {sys.domain_size=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"first_level_dir")==0) {sys.first_level_dir=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_cores")==0) {sys.homogeneous_cores=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"core_tech_node")==0) {sys.core_tech_node=atof(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"target_core_clockrate")==0) {sys.target_core_clockrate=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"target_chip_area")==0) {sys.target_chip_area=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"temperature")==0) {sys.temperature=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_cache_levels")==0) {sys.number_cache_levels=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L1_property")==0) {sys.L1_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L2_property")==0) {sys.L2_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L2s")==0) {sys.homogeneous_L2s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L1Directories")==0) {sys.homogeneous_L1Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L2Directories")==0) {sys.homogeneous_L2Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L3_property")==0) {sys.L3_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L3s")==0) {sys.homogeneous_L3s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_ccs")==0) {sys.homogeneous_ccs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_NoCs")==0) {sys.homogeneous_NoCs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Max_area_deviation")==0) {sys.Max_area_deviation=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Max_power_deviation")==0) {sys.Max_power_deviation=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"device_type")==0) {sys.device_type=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"longer_channel_device")==0) {sys.longer_channel_device=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_dynamic_power")==0) {sys.opt_dynamic_power=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_lakage_power")==0) {sys.opt_lakage_power=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_clockrate")==0) {sys.opt_clockrate=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_area")==0) {sys.opt_area=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Embedded")==0) {sys.Embedded=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"interconnect_projection_type")==0) {sys.interconnect_projection_type=atoi(xNode2.getChildNode("param",i).getAttribute("value"))==0?0:1;continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"machine_bits")==0) {sys.machine_bits=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"virtual_address_width")==0) {sys.virtual_address_width=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"physical_address_width")==0) {sys.physical_address_width=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+                if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"virtual_memory_page_size")==0) {sys.virtual_memory_page_size=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;}
+        }
+
+//     if (sys.Private_L2 && sys.number_of_cores!=sys.number_of_L2s)
+//     {
+//             cout<<"Private L2: Number of L2s must equal to Number of Cores"<<endl;
+//             exit(0);
+//     }
+
+        itmp=xNode2.nChildNode("stat");
+        for(i=0; i<itmp; i++)
+        {
+                if (strcmp(xNode2.getChildNode("stat",i).getAttribute("name"),"total_cycles")==0) {sys.total_cycles=atof(xNode2.getChildNode("stat",i).getAttribute("value"));continue;}
+
+        }
+
+        //get the number of components within the second layer
+        unsigned int NumofCom_3=xNode2.nChildNode("component");
+        XMLNode xNode3,xNode4; //define the third-layer(system.core0) and fourth-layer(system.core0.predictor) xnodes
+
+        string strtmp;
+        char chtmp[60];
+        char chtmp1[60];
+        chtmp1[0]='\0';
+        unsigned int OrderofComponents_3layer=0;
+        if (NumofCom_3>OrderofComponents_3layer)
+        {
+                //___________________________get all system.core0-n________________________________________________
+                if (sys.homogeneous_cores==1) OrderofComponents_3layer=0;
+                else OrderofComponents_3layer=sys.number_of_cores-1;
+                for (i=0; i<=OrderofComponents_3layer; i++)
+                {
+                        xNode3=xNode2.getChildNode("component",i);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_cores or number_of_cores is not correct!");
+                                exit(0);
+                        }
+                        else{
+                                if (strstr(xNode3.getAttribute("name"),"core")!=NULL)
+                                {
+                                        { //For cpu0-cpui
+                                                //Get all params with system.core?
+                                                itmp=xNode3.nChildNode("param");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clock_rate")==0) {sys.core[i].clock_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"opt_local")==0) {sys.core[i].opt_local=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"x86")==0) {sys.core[i].x86=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"machine_bits")==0) {sys.core[i].machine_bits=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"virtual_address_width")==0) {sys.core[i].virtual_address_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"physical_address_width")==0) {sys.core[i].physical_address_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_length")==0) {sys.core[i].instruction_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"opcode_width")==0) {sys.core[i].opcode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"micro_opcode_width")==0) {sys.core[i].micro_opcode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"machine_type")==0) {sys.core[i].machine_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"internal_datapath_width")==0) {sys.core[i].internal_datapath_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_hardware_threads")==0) {sys.core[i].number_hardware_threads=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fetch_width")==0) {sys.core[i].fetch_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_instruction_fetch_ports")==0) {sys.core[i].number_instruction_fetch_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"decode_width")==0) {sys.core[i].decode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"issue_width")==0) {sys.core[i].issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_issue_width")==0) {sys.core[i].peak_issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"commit_width")==0) {sys.core[i].commit_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fp_issue_width")==0) {sys.core[i].fp_issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"prediction_width")==0) {sys.core[i].prediction_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"pipelines_per_core")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.core[i].pipelines_per_core[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.core[i].pipelines_per_core[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"pipeline_depth")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.core[i].pipeline_depth[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.core[i].pipeline_depth[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"FPU")==0) {strcpy(sys.core[i].FPU,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"divider_multiplier")==0) {strcpy(sys.core[i].divider_multiplier,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ALU_per_core")==0) {sys.core[i].ALU_per_core=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"FPU_per_core")==0) {sys.core[i].FPU_per_core=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"MUL_per_core")==0) {sys.core[i].MUL_per_core=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_buffer_size")==0) {sys.core[i].instruction_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"decoded_stream_buffer_size")==0) {sys.core[i].decoded_stream_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_window_scheme")==0) {sys.core[i].instruction_window_scheme  =atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_window_size")==0) {sys.core[i].instruction_window_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fp_instruction_window_size")==0) {sys.core[i].fp_instruction_window_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ROB_size")==0) {sys.core[i].ROB_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"archi_Regs_IRF_size")==0) {sys.core[i].archi_Regs_IRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"archi_Regs_FRF_size")==0) {sys.core[i].archi_Regs_FRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"phy_Regs_IRF_size")==0) {sys.core[i].phy_Regs_IRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"phy_Regs_FRF_size")==0) {sys.core[i].phy_Regs_FRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"rename_scheme")==0) {sys.core[i].rename_scheme=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"register_windows_size")==0) {sys.core[i].register_windows_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LSU_order")==0) {strcpy(sys.core[i].LSU_order,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"store_buffer_size")==0) {sys.core[i].store_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"load_buffer_size")==0) {sys.core[i].load_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_ports")==0) {sys.core[i].memory_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dcache_dual_pump")==0) {strcpy(sys.core[i].Dcache_dual_pump,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"RAS_size")==0) {sys.core[i].RAS_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                }
+                                                //Get all stats with system.core?
+                                                itmp=xNode3.nChildNode("stat");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_instructions")==0) {sys.core[i].total_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_instructions")==0) {sys.core[i].int_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_instructions")==0) {sys.core[i].fp_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"branch_instructions")==0) {sys.core[i].branch_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"branch_mispredictions")==0) {sys.core[i].branch_mispredictions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_instructions")==0) {sys.core[i].committed_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_int_instructions")==0) {sys.core[i].committed_int_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_fp_instructions")==0) {sys.core[i].committed_fp_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_instructions")==0) {sys.core[i].load_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_instructions")==0) {sys.core[i].store_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_cycles")==0) {sys.core[i].total_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"idle_cycles")==0) {sys.core[i].idle_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"busy_cycles")==0) {sys.core[i].busy_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"instruction_buffer_reads")==0) {sys.core[i].instruction_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"instruction_buffer_write")==0) {sys.core[i].instruction_buffer_write=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ROB_reads")==0) {sys.core[i].ROB_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ROB_writes")==0) {sys.core[i].ROB_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"rename_reads")==0) {sys.core[i].rename_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"rename_writes")==0) {sys.core[i].rename_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_rename_reads")==0) {sys.core[i].fp_rename_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_rename_writes")==0) {sys.core[i].fp_rename_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_reads")==0) {sys.core[i].inst_window_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_writes")==0) {sys.core[i].inst_window_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_wakeup_accesses")==0) {sys.core[i].inst_window_wakeup_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_selections")==0) {sys.core[i].inst_window_selections=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_reads")==0) {sys.core[i].fp_inst_window_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_writes")==0) {sys.core[i].fp_inst_window_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_wakeup_accesses")==0) {sys.core[i].fp_inst_window_wakeup_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_int_regfile_reads")==0) {sys.core[i].archi_int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_float_regfile_reads")==0) {sys.core[i].archi_float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_int_regfile_reads")==0) {sys.core[i].phy_int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_float_regfile_reads")==0) {sys.core[i].phy_float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_int_regfile_writes")==0) {sys.core[i].archi_int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_float_regfile_writes")==0) {sys.core[i].archi_float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_int_regfile_writes")==0) {sys.core[i].phy_int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_float_regfile_writes")==0) {sys.core[i].phy_float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_regfile_reads")==0) {sys.core[i].int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"float_regfile_reads")==0) {sys.core[i].float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_regfile_writes")==0) {sys.core[i].int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"float_regfile_writes")==0) {sys.core[i].float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"windowed_reg_accesses")==0) {sys.core[i].windowed_reg_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"windowed_reg_transports")==0) {sys.core[i].windowed_reg_transports=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"function_calls")==0) {sys.core[i].function_calls=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"context_switches")==0) {sys.core[i].context_switches=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ialu_accesses")==0) {sys.core[i].ialu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fpu_accesses")==0) {sys.core[i].fpu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"mul_accesses")==0) {sys.core[i].mul_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_alu_accesses")==0) {sys.core[i].cdb_alu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_mul_accesses")==0) {sys.core[i].cdb_mul_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_fpu_accesses")==0) {sys.core[i].cdb_fpu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_reads")==0) {sys.core[i].load_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_writes")==0) {sys.core[i].load_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_cams")==0) {sys.core[i].load_buffer_cams=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_reads")==0) {sys.core[i].store_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_writes")==0) {sys.core[i].store_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_cams")==0) {sys.core[i].store_buffer_cams=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_forwards")==0) {sys.core[i].store_buffer_forwards=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_access")==0) {sys.core[i].main_memory_access=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_read")==0) {sys.core[i].main_memory_read=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        // Parsed with atof like main_memory_access / main_memory_read; atoi would stop at '.' or an exponent.
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_write")==0) {sys.core[i].main_memory_write=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"pipeline_duty_cycle")==0) {sys.core[i].pipeline_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"IFU_duty_cycle")==0) {sys.core[i].IFU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"BR_duty_cycle")==0) {sys.core[i].BR_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"LSU_duty_cycle")==0) {sys.core[i].LSU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MemManU_I_duty_cycle")==0) {sys.core[i].MemManU_I_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MemManU_D_duty_cycle")==0) {sys.core[i].MemManU_D_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ALU_duty_cycle")==0) {sys.core[i].ALU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MUL_duty_cycle")==0) {sys.core[i].MUL_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"FPU_duty_cycle")==0) {sys.core[i].FPU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ALU_cdb_duty_cycle")==0) {sys.core[i].ALU_cdb_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MUL_cdb_duty_cycle")==0) {sys.core[i].MUL_cdb_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        // Parsed with atof like every other *_duty_cycle; atoi would turn a fractional value such as "0.5" into 0.
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"FPU_cdb_duty_cycle")==0) {sys.core[i].FPU_cdb_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                }
+                                        }
+
+                                        NumofCom_4=xNode3.nChildNode("component"); //get the number of components within the third layer
+                                        for(j=0; j<NumofCom_4; j++)
+                                        {
+                                                xNode4=xNode3.getChildNode("component",j);
+                                                if (strcmp(xNode4.getAttribute("name"),"PBT")==0)
+                                                { //find PBT: copy this component's recognised params and stats into sys.core[i].predictor
+                                                        itmp=xNode4.nChildNode("param"); // number of "param" children of this component
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.predictor--PBT; names that match nothing are ignored
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"prediction_width")==0) {sys.core[i].predictor.prediction_width=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"prediction_scheme")==0) {strcpy(sys.core[i].predictor.prediction_scheme,xNode4.getChildNode("param",k).getAttribute("value"));continue;} // NOTE(review): strcpy is unbounded; destination size is not visible here
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"predictor_size")==0) {sys.core[i].predictor.predictor_size=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"predictor_entries")==0) {sys.core[i].predictor.predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"local_predictor_size")==0) // value is a comma-separated list of integers
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0; // index of the next local_predictor_size[] slot to fill
+                                                                        for(n=0; n<strtmp.length(); n++) // walk the value one character at a time
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]); // chtmp = this character as a one-character string
+                                                                                        strcat(chtmp1,chtmp); // grow the pending token; NOTE(review): relies on chtmp1 being empty on entry and large enough -- neither is visible here
+                                                                                }
+                                                                                else{ // comma: convert the finished token, then start a new one
+                                                                                        sys.core[i].predictor.local_predictor_size[m]=atoi(chtmp1); // NOTE(review): m is not checked against the array size
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].predictor.local_predictor_size[m]=atoi(chtmp1); // the last token has no trailing comma, so it is stored here
+                                                                        m++;
+                                                                        chtmp1[0]='\0'; // leave the scratch buffer empty for the next list
+                                                                        continue;
+                                                                }
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"local_predictor_entries")==0) {sys.core[i].predictor.local_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"global_predictor_entries")==0) {sys.core[i].predictor.global_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"global_predictor_bits")==0) {sys.core[i].predictor.global_predictor_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"chooser_predictor_entries")==0) {sys.core[i].predictor.chooser_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"chooser_predictor_bits")==0) {sys.core[i].predictor.chooser_predictor_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat"); // number of "stat" children of this component
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of stat in system.core0.predictor--PBT; only predictor_accesses is recognised
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"predictor_accesses")==0) sys.core[i].predictor.predictor_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));
+                                                        }
+                                                }
+                                                if (strcmp(xNode4.getAttribute("name"),"itlb")==0)
+                                                {//find system.core0.itlb
+                                                        itmp=xNode4.nChildNode("param");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.itlb--itlb
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_entries")==0) sys.core[i].itlb.number_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of stat in itlb
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].itlb.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].itlb.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].itlb.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].itlb.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        }
+                                                }
+                                                if (strcmp(xNode4.getAttribute("name"),"icache")==0)
+                                                {//find system.core0.icache: copy this component's recognised params and stats into sys.core[i].icache
+                                                        itmp=xNode4.nChildNode("param"); // number of "param" children of this component
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.icache--icache; both recognised params are comma-separated lists
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"icache_config")==0) // list elements are converted with atof
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0; // index of the next icache_config[] slot to fill
+                                                                        for(n=0; n<strtmp.length(); n++) // walk the value one character at a time
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]); // chtmp = this character as a one-character string
+                                                                                        strcat(chtmp1,chtmp); // grow the pending token; NOTE(review): relies on chtmp1 being empty on entry and large enough -- neither is visible here
+                                                                                }
+                                                                                else{ // comma: convert the finished token, then start a new one
+                                                                                        sys.core[i].icache.icache_config[m]=atof(chtmp1); // NOTE(review): m is not checked against the array size
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].icache.icache_config[m]=atof(chtmp1); // the last token has no trailing comma, so it is stored here
+                                                                        m++;
+                                                                        chtmp1[0]='\0'; // leave the scratch buffer empty for the next list
+                                                                        continue;
+                                                                }
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) // list elements are converted with atoi
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0; // index of the next buffer_sizes[] slot to fill
+                                                                        for(n=0; n<strtmp.length(); n++)
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                                        strcat(chtmp1,chtmp); // same scratch-buffer tokenizer as for icache_config above
+                                                                                }
+                                                                                else{
+                                                                                        sys.core[i].icache.buffer_sizes[m]=atoi(chtmp1);
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].icache.buffer_sizes[m]=atoi(chtmp1); // final token after the last comma
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                } // no continue needed: this is the last check in the loop body
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat"); // number of "stat" children of this component
+                                                        for(k=0; k<itmp; k++)
+                                                        { // each recognised stat is stored in the icache field of the same name; other names are ignored
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].icache.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].icache.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].icache.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.core[i].icache.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].icache.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].icache.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].icache.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"miss_buffer_access")==0) {sys.core[i].icache.miss_buffer_access=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.core[i].icache.fill_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.core[i].icache.prefetch_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.core[i].icache.prefetch_buffer_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.core[i].icache.prefetch_buffer_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.core[i].icache.prefetch_buffer_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].icache.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        }
+                                                }
+                                                if (strcmp(xNode4.getAttribute("name"),"dtlb")==0)
+                                                {//find system.core0.dtlb
+                                                        itmp=xNode4.nChildNode("param");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.dtlb--dtlb
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_entries")==0) sys.core[i].dtlb.number_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of stat in dtlb
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].dtlb.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].dtlb.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].dtlb.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].dtlb.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].dtlb.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].dtlb.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].dtlb.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].dtlb.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].dtlb.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].dtlb.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                        }
+                                                }
+                                                if (strcmp(xNode4.getAttribute("name"),"dcache")==0)
+                                                {//find system.core0.dcache
+                                                        itmp=xNode4.nChildNode("param");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.dcache--dcache
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"dcache_config")==0)
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0;
+                                                                        for(n=0; n<strtmp.length(); n++)
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                                        strcat(chtmp1,chtmp);
+                                                                                }
+                                                                                else{
+                                                                                        sys.core[i].dcache.dcache_config[m]=atof(chtmp1);
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].dcache.dcache_config[m]=atof(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                        continue;
+                                                                }
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0)
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0;
+                                                                        for(n=0; n<strtmp.length(); n++)
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                                        strcat(chtmp1,chtmp);
+                                                                                }
+                                                                                else{
+                                                                                        sys.core[i].dcache.buffer_sizes[m]=atoi(chtmp1);
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].dcache.buffer_sizes[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of stat in dcache
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].dcache.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].dcache.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].dcache.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].dcache.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].dcache.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].dcache.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].dcache.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].dcache.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].dcache.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.core[i].dcache.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.core[i].dcache.write_backs=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"miss_buffer_access")==0) {sys.core[i].dcache.miss_buffer_access=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.core[i].dcache.fill_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.core[i].dcache.prefetch_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.core[i].dcache.prefetch_buffer_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.core[i].dcache.prefetch_buffer_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.core[i].dcache.prefetch_buffer_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.core[i].dcache.wbb_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.core[i].dcache.wbb_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].dcache.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                        }
+                                                }
+                                                if (strcmp(xNode4.getAttribute("name"),"BTB")==0)
+                                                {//find system.core0.BTB
+                                                        itmp=xNode4.nChildNode("param");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of param in system.core0.BTB--BTB
+                                                                if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"BTB_config")==0)
+                                                                {
+                                                                        strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                        m=0;
+                                                                        for(n=0; n<strtmp.length(); n++)
+                                                                        {
+                                                                                if (strtmp[n]!=',')
+                                                                                {
+                                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                                        strcat(chtmp1,chtmp);
+                                                                                }
+                                                                                else{
+                                                                                        sys.core[i].BTB.BTB_config[m]=atoi(chtmp1);
+                                                                                        m++;
+                                                                                        chtmp1[0]='\0';
+                                                                                }
+                                                                        }
+                                                                        sys.core[i].BTB.BTB_config[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        itmp=xNode4.nChildNode("stat");
+                                                        for(k=0; k<itmp; k++)
+                                                        { //get all items of stat in BTB
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].BTB.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].BTB.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].BTB.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].BTB.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].BTB.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].BTB.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].BTB.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].BTB.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].BTB.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                                if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.core[i].BTB.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        }
+                                                }
+                                        }
+                                }
+                                else {
+                                        printf("The value of homogeneous_cores or number_of_cores is not correct!");
+                                        exit(0);
+                                }
+                        }
+                }
+
+                //__________________________________________Get system.L1Directory0-n____________________________________________
+                int w,tmpOrderofComponents_3layer;
+                w=OrderofComponents_3layer+1;
+                tmpOrderofComponents_3layer=OrderofComponents_3layer;
+                if (sys.homogeneous_L1Directories==1) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L1Directories;
+
+                for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++)
+                {
+                        xNode3=xNode2.getChildNode("component",w);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_L1Directories or number_of_L1Directories is not correct!");
+                                exit(0);
+                        }
+                        else
+                        {
+                                if (strstr(xNode3.getAttribute("id"),"L1Directory")!=NULL)
+                                {
+                                        itmp=xNode3.nChildNode("param");
+                                        for(k=0; k<itmp; k++)
+                                        { //get all items of param in system.L1Directory
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dir_config")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L1Directory[i].Dir_config[m]=atof(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L1Directory[i].Dir_config[m]=atof(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L1Directory[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L1Directory[i].buffer_sizes[m]=atoi(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L1Directory[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L1Directory[i].ports[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L1Directory[i].ports[m]=atoi(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L1Directory[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Directory_type")==0) {sys.L1Directory[i].Directory_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"3D_stack")==0) {strcpy(sys.L1Directory[i].threeD_stack,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                        }
+                                        itmp=xNode3.nChildNode("stat");
+                                        for(k=0; k<itmp; k++)
+                                        { //get all items of stat in system.L1Directory
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L1Directory[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L1Directory[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L1Directory[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L1Directory[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L1Directory[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L1Directory[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L1Directory[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                        }
+                                        w=w+1;
+                                }
+                                else {
+                                        printf("The value of homogeneous_L1Directories or number_of_L1Directories is not correct!");
+                                        exit(0);
+                                }
+                        }
+                }
+
+                //__________________________________________Get system.L2Directory0-n____________________________________________
+                w=OrderofComponents_3layer+1;
+                tmpOrderofComponents_3layer=OrderofComponents_3layer;
+                if (sys.homogeneous_L2Directories==1) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L2Directories;
+
+                for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++)
+                {
+                        xNode3=xNode2.getChildNode("component",w);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_L2Directories or number_of_L2Directories is not correct!");
+                                exit(0);
+                        }
+                        else
+                        {
+                                if (strstr(xNode3.getAttribute("id"),"L2Directory")!=NULL)
+                                {
+                                        itmp=xNode3.nChildNode("param");
+                                        for(k=0; k<itmp; k++)
+                                        { //get all items of param in system.L2Directory
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dir_config")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L2Directory[i].Dir_config[m]=atof(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L2Directory[i].Dir_config[m]=atof(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L2Directory[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L2Directory[i].buffer_sizes[m]=atoi(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L2Directory[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Directory_type")==0) {sys.L2Directory[i].Directory_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0)
+                                                {
+                                                        strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                        m=0;
+                                                        for(n=0; n<strtmp.length(); n++)
+                                                        {
+                                                                if (strtmp[n]!=',')
+                                                                {
+                                                                        sprintf(chtmp,"%c",strtmp[n]);
+                                                                        strcat(chtmp1,chtmp);
+                                                                }
+                                                                else{
+                                                                        sys.L2Directory[i].ports[m]=atoi(chtmp1);
+                                                                        m++;
+                                                                        chtmp1[0]='\0';
+                                                                }
+                                                        }
+                                                        sys.L2Directory[i].ports[m]=atoi(chtmp1);
+                                                        m++;
+                                                        chtmp1[0]='\0';
+                                                        continue;
+                                                }
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L2Directory[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"3D_stack")==0) {strcpy(sys.L2Directory[i].threeD_stack,xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                        }
+                                        itmp=xNode3.nChildNode("stat");
+                                        for(k=0; k<itmp; k++)
+                                        { //get all items of stat in system.L2Directory
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L2Directory[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L2Directory[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L2Directory[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L2Directory[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L2Directory[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L2Directory[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L2Directory[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                        }
+                                        w=w+1;
+                                }
+                                else {
+                                        printf("The value of homogeneous_L2Directories or number_of_L2Directories is not correct!");
+                                        exit(0);
+                                }
+                        }
+                }
+
+                //__________________________________________Get system.L2[0..n]____________________________________________
+                w=OrderofComponents_3layer+1;
+                tmpOrderofComponents_3layer=OrderofComponents_3layer;
+                if (sys.homogeneous_L2s==1) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L2s;
+
+                for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++)
+                {
+                        xNode3=xNode2.getChildNode("component",w);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_L2s or number_of_L2s is not correct!");
+                                exit(0);
+                        }
+                        else
+                        {
+                                if (strstr(xNode3.getAttribute("name"),"L2")!=NULL)
+                                {
+                                        { //For L20-L2i
+                                                //Get all params with system.L2?
+                                                itmp=xNode3.nChildNode("param");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"L2_config")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L2[i].L2_config[m]=atof(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L2[i].L2_config[m]=atof(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L2[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"merged_dir")==0) {sys.L2[i].merged_dir=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L2[i].ports[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L2[i].ports[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L2[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"threeD_stack")==0) {strcpy(sys.L2[i].threeD_stack,(xNode3.getChildNode("param",k).getAttribute("value")));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L2[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L2[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                }
+                                                //Get all stats with system.L2?
+                                                itmp=xNode3.nChildNode("stat");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L2[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L2[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L2[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.L2[i].total_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.L2[i].total_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.L2[i].read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.L2[i].write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L2[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L2[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.L2[i].replacements=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.L2[i].write_backs=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"miss_buffer_accesses")==0) {sys.L2[i].miss_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.L2[i].fill_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.L2[i].prefetch_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.L2[i].prefetch_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.L2[i].prefetch_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.L2[i].prefetch_buffer_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.L2[i].wbb_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.L2[i].wbb_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L2[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L2[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L2[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L2[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_hits")==0) {sys.L2[i].homenode_read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_hits")==0) {sys.L2[i].homenode_write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_misses")==0) {sys.L2[i].homenode_read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_misses")==0) {sys.L2[i].homenode_write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"dir_duty_cycle")==0) {sys.L2[i].dir_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                }
+                                        }
+                                        w=w+1;
+                                }
+                                else {
+                                        printf("The value of homogeneous_L2s or number_of_L2s is not correct!");
+                                        exit(0);
+                                }
+                        }
+                }
+                //__________________________________________Get system.L3[0..n]____________________________________________
+                w=OrderofComponents_3layer+1;
+                tmpOrderofComponents_3layer=OrderofComponents_3layer;
+                if (sys.homogeneous_L3s==1) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L3s;
+
+                for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++)
+                {
+                        xNode3=xNode2.getChildNode("component",w);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_L3s or number_of_L3s is not correct!");
+                                exit(0);
+                        }
+                        else
+                        {
+                                if (strstr(xNode3.getAttribute("name"),"L3")!=NULL)
+                                {
+                                        { //For L30-L3i
+                                                //Get all params with system.L3?
+                                                itmp=xNode3.nChildNode("param");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"L3_config")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L3[i].L3_config[m]=atof(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L3[i].L3_config[m]=atof(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L3[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"merged_dir")==0) {sys.L3[i].merged_dir=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L3[i].ports[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L3[i].ports[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L3[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"threeD_stack")==0) {strcpy(sys.L3[i].threeD_stack,(xNode3.getChildNode("param",k).getAttribute("value")));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0;
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                sys.L3[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                sys.L3[i].buffer_sizes[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                }
+                                                //Get all stats with system.L3?
+                                                itmp=xNode3.nChildNode("stat");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L3[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L3[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L3[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.L3[i].total_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.L3[i].total_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.L3[i].read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.L3[i].write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L3[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L3[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.L3[i].replacements=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.L3[i].write_backs=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"miss_buffer_accesses")==0) {sys.L3[i].miss_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.L3[i].fill_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.L3[i].prefetch_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.L3[i].prefetch_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.L3[i].prefetch_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.L3[i].prefetch_buffer_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.L3[i].wbb_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.L3[i].wbb_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L3[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L3[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L3[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        //Parse the directory write-access counter. This line used to repeat the "homenode_read_accesses" test above, which can never match again after that branch's continue, so write accesses were never read from the XML.
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_accesses")==0) {sys.L3[i].homenode_write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_hits")==0) {sys.L3[i].homenode_read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_hits")==0) {sys.L3[i].homenode_write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_misses")==0) {sys.L3[i].homenode_read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_misses")==0) {sys.L3[i].homenode_write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"dir_duty_cycle")==0) {sys.L3[i].dir_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                                                }
+                                        }
+                                        w=w+1;
+                                }
+                                else {
+                                        printf("The value of homogeneous_L3s or number_of_L3s is not correct!");
+                                        exit(0);
+                                }
+                        }
+                }
+                //__________________________________________Get system.NoC[0..n]____________________________________________
+                w=OrderofComponents_3layer+1;
+                tmpOrderofComponents_3layer=OrderofComponents_3layer;
+                if (sys.homogeneous_NoCs==1) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_NoCs;
+
+                for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++)
+                {
+                        xNode3=xNode2.getChildNode("component",w);
+                        if (xNode3.isEmpty()==1) {
+                                printf("The value of homogeneous_NoCs or number_of_NoCs is not correct!");
+                                exit(0);
+                        }
+                        else
+                        {
+                                if (strstr(xNode3.getAttribute("name"),"noc")!=NULL)
+                                {
+                                        { //For NoC0-NoCi
+                                                //Get all params with system.NoC?
+                                                itmp=xNode3.nChildNode("param");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.NoC[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.NoC[i].type=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"topology")==0) {strcpy(sys.NoC[i].topology,(xNode3.getChildNode("param",k).getAttribute("value")));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"horizontal_nodes")==0) {sys.NoC[i].horizontal_nodes=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"vertical_nodes")==0) {sys.NoC[i].vertical_nodes=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"has_global_link")==0) {sys.NoC[i].has_global_link=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_throughput")==0) {sys.NoC[i].link_throughput=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_latency")==0) {sys.NoC[i].link_latency=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"input_ports")==0) {sys.NoC[i].input_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"output_ports")==0) {sys.NoC[i].output_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"virtual_channel_per_port")==0) {sys.NoC[i].virtual_channel_per_port=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"flit_bits")==0) {sys.NoC[i].flit_bits=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"input_buffer_entries_per_vc")==0) {sys.NoC[i].input_buffer_entries_per_vc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"dual_pump")==0) {sys.NoC[i].dual_pump=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"chip_coverage")==0) {sys.NoC[i].chip_coverage=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_routing_over_percentage")==0) {sys.NoC[i].route_over_perc=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        //"ports_of_input_buffer" carries a comma-separated list of integers: split the value on ','
+                                                        //and store the tokens, in order, into sys.NoC[i].ports_of_input_buffer[0], [1], ...
+                                                        //NOTE(review): tokens are built by strcat into chtmp1 and m grows by one per token, with no
+                                                        //bounds check in this block. Both chtmp1 and ports_of_input_buffer are declared outside this
+                                                        //excerpt -- confirm their capacities against the longest list the XML may carry.
+                                                        //NOTE(review): appending presumes chtmp1 is an empty string on entry; it is cleared after
+                                                        //every token here, but its initial state is set elsewhere -- confirm.
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports_of_input_buffer")==0)
+                                                        {
+                                                                strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value"));
+                                                                m=0; //index of the next list slot to fill
+                                                                for(n=0; n<strtmp.length(); n++)
+                                                                {
+                                                                        if (strtmp[n]!=',')
+                                                                        {
+                                                                                //append the current character to the token being accumulated
+                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                strcat(chtmp1,chtmp);
+                                                                        }
+                                                                        else{
+                                                                                //separator reached: convert the finished token and start a new one
+                                                                                sys.NoC[i].ports_of_input_buffer[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                        }
+                                                                }
+                                                                //the text after the last ',' (or the whole value when there is no ',') is stored here
+                                                                sys.NoC[i].ports_of_input_buffer[m]=atoi(chtmp1);
+                                                                m++;
+                                                                chtmp1[0]='\0';
+                                                                continue;
+                                                        }
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_of_crossbars")==0) {sys.NoC[i].number_of_crossbars=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"crossbar_type")==0) {strcpy(sys.NoC[i].crossbar_type,(xNode3.getChildNode("param",k).getAttribute("value")));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"crosspoint_type")==0) {strcpy(sys.NoC[i].crosspoint_type,(xNode3.getChildNode("param",k).getAttribute("value")));continue;}
+                                                        if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"arbiter_type")==0) {sys.NoC[i].arbiter_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                                }
+                                                NumofCom_4=xNode3.nChildNode("component"); //get the number of components within the third layer
+                                                for(j=0; j<NumofCom_4; j++)
+                                                {
+                                                        xNode4=xNode3.getChildNode("component",j);
+                                                        //Handle the child component named exactly "xbar0": copy its params into sys.NoC[i].xbar0,
+                                                        //then walk its stats. Child components with any other name are ignored.
+                                                        if (strcmp(xNode4.getAttribute("name"),"xbar0")==0)
+                                                        { //found the xbar0 sub-component of this NoC
+                                                                itmp=xNode4.nChildNode("param");
+                                                                for(k=0; k<itmp; k++)
+                                                                { //get all items of param in system.NoC?.xbar0
+                                                                        if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_of_inputs_of_crossbars")==0) {sys.NoC[i].xbar0.number_of_inputs_of_crossbars=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                        if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_of_outputs_of_crossbars")==0) {sys.NoC[i].xbar0.number_of_outputs_of_crossbars=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                        if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"flit_bits")==0) {sys.NoC[i].xbar0.flit_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                        if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"input_buffer_entries_per_port")==0) {sys.NoC[i].xbar0.input_buffer_entries_per_port=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;}
+                                                                        //comma-separated integer list, split the same way as the NoC-level "ports_of_input_buffer"
+                                                                        //NOTE(review): strcat into chtmp1 and the index m are not bounds-checked here; the
+                                                                        //buffers are declared outside this excerpt -- confirm their capacities.
+                                                                        if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"ports_of_input_buffer")==0)
+                                                                        {
+                                                                                strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value"));
+                                                                                m=0; //index of the next list slot to fill
+                                                                                for(n=0; n<strtmp.length(); n++)
+                                                                                {
+                                                                                        if (strtmp[n]!=',')
+                                                                                        {
+                                                                                                //append the current character to the token being accumulated
+                                                                                                sprintf(chtmp,"%c",strtmp[n]);
+                                                                                                strcat(chtmp1,chtmp);
+                                                                                        }
+                                                                                        else{
+                                                                                                //separator reached: convert the finished token and start a new one
+                                                                                                sys.NoC[i].xbar0.ports_of_input_buffer[m]=atoi(chtmp1);
+                                                                                                m++;
+                                                                                                chtmp1[0]='\0';
+                                                                                        }
+                                                                                }
+                                                                                //store the text after the last ','
+                                                                                sys.NoC[i].xbar0.ports_of_input_buffer[m]=atoi(chtmp1);
+                                                                                m++;
+                                                                                chtmp1[0]='\0';
+                                                                                //no continue here: this is the last test in the loop body
+                                                                        }
+                                                                }
+                                                                itmp=xNode4.nChildNode("stat");
+                                                                for(k=0; k<itmp; k++)
+                                                                { //get all items of stat in the xbar0 sub-component
+                                                                        //NOTE(review): this looks copied from the core branch-predictor parser. It matches the stat
+                                                                        //name "predictor_accesses" and writes sys.core[i].predictor, indexed by the NoC index i,
+                                                                        //rather than a field of sys.NoC[i].xbar0. Confirm the intended stat name and destination.
+                                                                        if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"predictor_accesses")==0) sys.core[i].predictor.predictor_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));
+                                                                }
+                                                        }
+                                                }
+                                                //Get all stats with system.NoC?
+                                                itmp=xNode3.nChildNode("stat");
+                                                for(k=0; k<itmp; k++)
+                                                {
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) sys.NoC[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));
+                                                        if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) sys.NoC[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));
+                                                }
+                                        }
+                                        w=w+1;
+                                }
+                        }
+                }
+                //__________________________________________Get system.mem____________________________________________
+                if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                xNode3=xNode2.getChildNode("component",OrderofComponents_3layer);
+                if (xNode3.isEmpty()==1) {
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                if (strstr(xNode3.getAttribute("id"),"system.mem")!=NULL)
+                {
+
+                        itmp=xNode3.nChildNode("param");
+                        for(k=0; k<itmp; k++)
+                        { //get all items of param in system.mem
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"mem_tech_node")==0) {sys.mem.mem_tech_node=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_clock")==0) {sys.mem.device_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.mem.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"capacity_per_channel")==0) {sys.mem.capacity_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.mem.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"num_banks_of_DRAM_chip")==0) {sys.mem.num_banks_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Block_width_of_DRAM_chip")==0) {sys.mem.Block_width_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"output_width_of_DRAM_chip")==0) {sys.mem.output_width_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"page_size_of_DRAM_chip")==0) {sys.mem.page_size_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"burstlength_of_DRAM_chip")==0) {sys.mem.burstlength_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"internal_prefetch_of_DRAM_chip")==0) {sys.mem.internal_prefetch_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                        }
+                        itmp=xNode3.nChildNode("stat");
+                        for(k=0; k<itmp; k++)
+                        { //get all items of stat in system.mem
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.mem.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) {sys.mem.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.mem.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                        }
+                }
+                else{
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                //__________________________________________Get system.mc____________________________________________
+                if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                xNode3=xNode2.getChildNode("component",OrderofComponents_3layer);
+                if (xNode3.isEmpty()==1) {
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                if (strstr(xNode3.getAttribute("id"),"system.mc")!=NULL)
+                {
+                        itmp=xNode3.nChildNode("param");
+                        //Copy every recognised <param> of system.mc into sys.mc. Values are converted with atoi
+                        //(LVDS and withPHY are additionally cast to bool); unrecognised names are ignored.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of param in system.mc
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"mc_clock")==0) {sys.mc.mc_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                //the XML parameter "block_size" is stored in the llc_line_length field
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"block_size")==0) {sys.mc.llc_line_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_mcs")==0) {sys.mc.number_mcs=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_channels_per_mc")==0) {sys.mc.memory_channels_per_mc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"req_window_size_per_channel")==0) {sys.mc.req_window_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"IO_buffer_size_per_channel")==0) {sys.mc.IO_buffer_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"databus_width")==0) {sys.mc.databus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"addressbus_width")==0) {sys.mc.addressbus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.mc.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.mc.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LVDS")==0) {sys.mc.LVDS=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.mc.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.mc.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+
+                        }
+                        itmp=xNode3.nChildNode("stat");
+                        //Copy the recognised <stat> entries of system.mc into sys.mc, converting each value with atof.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of stat in system.mc
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.mc.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) {sys.mc.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.mc.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                        }
+                }
+                else{
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                //__________________________________________Get system.niu____________________________________________
+                if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                xNode3=xNode2.getChildNode("component",OrderofComponents_3layer);
+                if (xNode3.isEmpty()==1) {
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                if (strstr(xNode3.getAttribute("id"),"system.niu")!=NULL)
+                {
+                        itmp=xNode3.nChildNode("param");
+                        //Copy the recognised <param> entries of system.niu into sys.niu, converting each value with atoi.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of param in system.niu
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.niu.clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_units")==0) {sys.niu.number_units=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.niu.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                        }
+                        itmp=xNode3.nChildNode("stat");
+                        //Copy the recognised <stat> entries of system.niu into sys.niu, converting each value with atof.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of stat in system.niu
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.niu.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.niu.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                        }
+                }
+                else{
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+
+                //__________________________________________Get system.pcie____________________________________________
+                if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                xNode3=xNode2.getChildNode("component",OrderofComponents_3layer);
+                if (xNode3.isEmpty()==1) {
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                if (strstr(xNode3.getAttribute("id"),"system.pcie")!=NULL)
+                {
+                        itmp=xNode3.nChildNode("param");
+                        //Copy the recognised <param> entries of system.pcie into sys.pcie. Values are converted with
+                        //atoi (withPHY is additionally cast to bool); unrecognised names are ignored.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of param in system.pcie
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.pcie.clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_units")==0) {sys.pcie.number_units=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"num_channels")==0) {sys.pcie.num_channels=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.pcie.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.pcie.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+
+                        }
+                        itmp=xNode3.nChildNode("stat");
+                        //Copy the recognised <stat> entries of system.pcie into sys.pcie, converting each value with atof.
+                        for(k=0; k<itmp; k++)
+                        { //get all items of stat in system.pcie
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.pcie.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.pcie.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                        }
+                }
+                else{
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                //__________________________________________Get system.flashcontroller____________________________________________
+                if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1;
+                xNode3=xNode2.getChildNode("component",OrderofComponents_3layer);
+                if (xNode3.isEmpty()==1) {
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+                if (strstr(xNode3.getAttribute("id"),"system.flashc")!=NULL)
+                {
+                        itmp=xNode3.nChildNode("param");
+                        for(k=0; k<itmp; k++)
+                        { //get all items of param in system.mem
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"flashc_clock")==0) {sys.flashc.mc_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"block_size")==0) {sys.flashc.llc_line_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_flashcs")==0) {sys.flashc.number_mcs=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_channels_per_flashc")==0) {sys.flashc.memory_channels_per_mc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"req_window_size_per_channel")==0) {sys.flashc.req_window_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"IO_buffer_size_per_channel")==0) {sys.flashc.IO_buffer_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"databus_width")==0) {sys.flashc.databus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"addressbus_width")==0) {sys.flashc.addressbus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.flashc.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.flashc.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LVDS")==0) {sys.flashc.LVDS=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.flashc.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.flashc.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+
+                        }
+                        itmp=xNode3.nChildNode("stat");
+                        for(k=0; k<itmp; k++)
+                        { //get all items of stat in system.mendirectory
+//                             if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.flashc.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) {sys.flashc.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+//                             if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.flashc.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.flashc.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+                                if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.flashc.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+
+                        }
+                }
+                else{
+                        printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!");
+                        exit(0);
+                }
+
+        }
+}
+void ParseXML::initialize() //Initialize all
+{
+        //All number_of_* at the level of 'system' 03/21/2009
+        sys.number_of_cores=1;
+        sys.number_of_L1Directories=1;
+        sys.number_of_L2Directories=1;
+        sys.number_of_L2s=1;
+        sys.Private_L2 = false;
+        sys.number_of_L3s=1;
+        sys.number_of_NoCs=1;
+        // All params at the level of 'system'
+        //strcpy(sys.homogeneous_cores,"default");
+        sys.core_tech_node=1;
+        sys.target_core_clockrate=1;
+        sys.target_chip_area=1;
+        sys.temperature=1;
+        sys.number_cache_levels=1;
+        sys.homogeneous_cores=1;
+        sys.homogeneous_L1Directories=1;
+        sys.homogeneous_L2Directories=1;
+        sys.homogeneous_L2s=1;
+        sys.homogeneous_L3s=1;
+        sys.homogeneous_NoCs=1;
+        sys.homogeneous_ccs=1;
+
+        sys.Max_area_deviation=1;
+        sys.Max_power_deviation=1;
+        sys.device_type=1;
+        sys.longer_channel_device =true;
+        sys.Embedded =false;
+        sys.opt_dynamic_power=false;
+        sys.opt_lakage_power=false;
+        sys.opt_clockrate=true;
+        sys.opt_area=false;
+        sys.interconnect_projection_type=1;
+        int i,j;
+        for (i=0; i<=63; i++)
+        {
+                sys.core[i].clock_rate=1;
+                sys.core[i].opt_local = true;
+                sys.core[i].x86 = false;
+                sys.core[i].machine_bits=1;
+                sys.core[i].virtual_address_width=1;
+                sys.core[i].physical_address_width=1;
+                sys.core[i].opcode_width=1;
+                sys.core[i].micro_opcode_width=1;
+                //strcpy(sys.core[i].machine_type,"default");
+                sys.core[i].internal_datapath_width=1;
+                sys.core[i].number_hardware_threads=1;
+                sys.core[i].fetch_width=1;
+                sys.core[i].number_instruction_fetch_ports=1;
+                sys.core[i].decode_width=1;
+                sys.core[i].issue_width=1;
+                sys.core[i].peak_issue_width=1;
+                sys.core[i].commit_width=1;
+                for (j=0; j<20; j++) sys.core[i].pipelines_per_core[j]=1;
+                for (j=0; j<20; j++) sys.core[i].pipeline_depth[j]=1;
+                strcpy(sys.core[i].FPU,"default");
+                strcpy(sys.core[i]. divider_multiplier,"default");
+                sys.core[i].ALU_per_core=1;
+                sys.core[i].FPU_per_core=1.0;
+                sys.core[i].MUL_per_core=1;
+                sys.core[i].instruction_buffer_size=1;
+                sys.core[i].decoded_stream_buffer_size=1;
+                //strcpy(sys.core[i].instruction_window_scheme,"default");
+                sys.core[i].instruction_window_size=1;
+                sys.core[i].ROB_size=1;
+                sys.core[i].archi_Regs_IRF_size=1;
+                sys.core[i].archi_Regs_FRF_size=1;
+                sys.core[i].phy_Regs_IRF_size=1;
+                sys.core[i].phy_Regs_FRF_size=1;
+                //strcpy(sys.core[i].rename_scheme,"default");
+                sys.core[i].register_windows_size=1;
+                strcpy(sys.core[i].LSU_order,"default");
+                sys.core[i].store_buffer_size=1;
+                sys.core[i].load_buffer_size=1;
+                sys.core[i].memory_ports=1;
+                strcpy(sys.core[i].Dcache_dual_pump,"default");
+                sys.core[i].RAS_size=1;
+                //all stats at the level of system.core(0-n)
+                sys.core[i].total_instructions=1;
+                sys.core[i].int_instructions=1;
+                sys.core[i].fp_instructions=1;
+                sys.core[i].branch_instructions=1;
+                sys.core[i].branch_mispredictions=1;
+                sys.core[i].committed_instructions=1;
+                sys.core[i].load_instructions=1;
+                sys.core[i].store_instructions=1;
+                sys.core[i].total_cycles=1;
+                sys.core[i].idle_cycles=1;
+                sys.core[i].busy_cycles=1;
+                sys.core[i].instruction_buffer_reads=1;
+                sys.core[i].instruction_buffer_write=1;
+                sys.core[i].ROB_reads=1;
+                sys.core[i].ROB_writes=1;
+                sys.core[i].rename_accesses=1;
+                sys.core[i].inst_window_reads=1;
+                sys.core[i].inst_window_writes=1;
+                sys.core[i].inst_window_wakeup_accesses=1;
+                sys.core[i].inst_window_selections=1;
+                sys.core[i].archi_int_regfile_reads=1;
+                sys.core[i].archi_float_regfile_reads=1;
+                sys.core[i].phy_int_regfile_reads=1;
+                sys.core[i].phy_float_regfile_reads=1;
+                sys.core[i].windowed_reg_accesses=1;
+                sys.core[i].windowed_reg_transports=1;
+                sys.core[i].function_calls=1;
+                sys.core[i].ialu_accesses=1;
+                sys.core[i].fpu_accesses=1;
+                sys.core[i].mul_accesses=1;
+                sys.core[i].cdb_alu_accesses=1;
+                sys.core[i].cdb_mul_accesses=1;
+                sys.core[i].cdb_fpu_accesses=1;
+                sys.core[i].load_buffer_reads=1;
+                sys.core[i].load_buffer_writes=1;
+                sys.core[i].load_buffer_cams=1;
+                sys.core[i].store_buffer_reads=1;
+                sys.core[i].store_buffer_writes=1;
+                sys.core[i].store_buffer_cams=1;
+                sys.core[i].store_buffer_forwards=1;
+                sys.core[i].main_memory_access=1;
+                sys.core[i].main_memory_read=1;
+                sys.core[i].main_memory_write=1;
+                sys.core[i].IFU_duty_cycle = 1;
+                sys.core[i].BR_duty_cycle = 1;
+                sys.core[i].LSU_duty_cycle = 1;
+                sys.core[i].MemManU_I_duty_cycle =1;
+                sys.core[i].MemManU_D_duty_cycle =1;
+                sys.core[i].ALU_duty_cycle =1;
+                sys.core[i].MUL_duty_cycle =1;
+                sys.core[i].FPU_duty_cycle =1;
+                sys.core[i].ALU_cdb_duty_cycle =1;
+                sys.core[i].MUL_cdb_duty_cycle =1;
+                sys.core[i].FPU_cdb_duty_cycle =1;
+                //system.core?.predictor
+                sys.core[i].predictor.prediction_width=1;
+                strcpy(sys.core[i].predictor.prediction_scheme,"default");
+                sys.core[i].predictor.predictor_size=1;
+                sys.core[i].predictor.predictor_entries=1;
+                sys.core[i].predictor.local_predictor_entries=1;
+                for (j=0; j<20; j++) sys.core[i].predictor.local_predictor_size[j]=1;
+                sys.core[i].predictor.global_predictor_entries=1;
+                sys.core[i].predictor.global_predictor_bits=1;
+                sys.core[i].predictor.chooser_predictor_entries=1;
+                sys.core[i].predictor.chooser_predictor_bits=1;
+                sys.core[i].predictor.predictor_accesses=1;
+                //system.core?.itlb
+                sys.core[i].itlb.number_entries=1;
+                sys.core[i].itlb.total_hits=1;
+                sys.core[i].itlb.total_accesses=1;
+                sys.core[i].itlb.total_misses=1;
+                //system.core?.icache
+                for (j=0; j<20; j++) sys.core[i].icache.icache_config[j]=1;
+                //strcpy(sys.core[i].icache.buffer_sizes,"default");
+                sys.core[i].icache.total_accesses=1;
+                sys.core[i].icache.read_accesses=1;
+                sys.core[i].icache.read_misses=1;
+                sys.core[i].icache.replacements=1;
+                sys.core[i].icache.read_hits=1;
+                sys.core[i].icache.total_hits=1;
+                sys.core[i].icache.total_misses=1;
+                sys.core[i].icache.miss_buffer_access=1;
+                sys.core[i].icache.fill_buffer_accesses=1;
+                sys.core[i].icache.prefetch_buffer_accesses=1;
+                sys.core[i].icache.prefetch_buffer_writes=1;
+                sys.core[i].icache.prefetch_buffer_reads=1;
+                sys.core[i].icache.prefetch_buffer_hits=1;
+                //system.core?.dtlb
+                sys.core[i].dtlb.number_entries=1;
+                sys.core[i].dtlb.total_accesses=1;
+                sys.core[i].dtlb.read_accesses=1;
+                sys.core[i].dtlb.write_accesses=1;
+                sys.core[i].dtlb.write_hits=1;
+                sys.core[i].dtlb.read_hits=1;
+                sys.core[i].dtlb.read_misses=1;
+                sys.core[i].dtlb.write_misses=1;
+                sys.core[i].dtlb.total_hits=1;
+                sys.core[i].dtlb.total_misses=1;
+                //system.core?.dcache
+                for (j=0; j<20; j++) sys.core[i].dcache.dcache_config[j]=1;
+                //strcpy(sys.core[i].dcache.buffer_sizes,"default");
+                sys.core[i].dcache.total_accesses=1;
+                sys.core[i].dcache.read_accesses=1;
+                sys.core[i].dcache.write_accesses=1;
+                sys.core[i].dcache.total_hits=1;
+                sys.core[i].dcache.total_misses=1;
+                sys.core[i].dcache.read_hits=1;
+                sys.core[i].dcache.write_hits=1;
+                sys.core[i].dcache.read_misses=1;
+                sys.core[i].dcache.write_misses=1;
+                sys.core[i].dcache.replacements=1;
+                sys.core[i].dcache.write_backs=1;
+                sys.core[i].dcache.miss_buffer_access=1;
+                sys.core[i].dcache.fill_buffer_accesses=1;
+                sys.core[i].dcache.prefetch_buffer_accesses=1;
+                sys.core[i].dcache.prefetch_buffer_writes=1;
+                sys.core[i].dcache.prefetch_buffer_reads=1;
+                sys.core[i].dcache.prefetch_buffer_hits=1;
+                sys.core[i].dcache.wbb_writes=1;
+                sys.core[i].dcache.wbb_reads=1;
+                //system.core?.BTB
+                for (j=0; j<20; j++) sys.core[i].BTB.BTB_config[j]=1;
+                sys.core[i].BTB.total_accesses=1;
+                sys.core[i].BTB.read_accesses=1;
+                sys.core[i].BTB.write_accesses=1;
+                sys.core[i].BTB.total_hits=1;
+                sys.core[i].BTB.total_misses=1;
+                sys.core[i].BTB.read_hits=1;
+                sys.core[i].BTB.write_hits=1;
+                sys.core[i].BTB.read_misses=1;
+                sys.core[i].BTB.write_misses=1;
+                sys.core[i].BTB.replacements=1;
+        }
+
+        //system_L1directory
+        for (i=0; i<=63; i++)
+        {
+                for (j=0; j<20; j++) sys.L1Directory[i].Dir_config[j]=1;
+                for (j=0; j<20; j++) sys.L1Directory[i].buffer_sizes[j]=1;
+                sys.L1Directory[i].clockrate=1;
+                sys.L1Directory[i].ports[20]=1;
+                sys.L1Directory[i].device_type=1;
+                strcpy(sys.L1Directory[i].threeD_stack,"default");
+                sys.L1Directory[i].total_accesses=1;
+                sys.L1Directory[i].read_accesses=1;
+                sys.L1Directory[i].write_accesses=1;
+                sys.L1Directory[i].duty_cycle =1;
+        }
+        //system_L2directory
+        for (i=0; i<=63; i++)
+        {
+                for (j=0; j<20; j++) sys.L2Directory[i].Dir_config[j]=1;
+                for (j=0; j<20; j++) sys.L2Directory[i].buffer_sizes[j]=1;
+                sys.L2Directory[i].clockrate=1;
+                sys.L2Directory[i].ports[20]=1;
+                sys.L2Directory[i].device_type=1;
+                strcpy(sys.L2Directory[i].threeD_stack,"default");
+                sys.L2Directory[i].total_accesses=1;
+                sys.L2Directory[i].read_accesses=1;
+                sys.L2Directory[i].write_accesses=1;
+                sys.L2Directory[i].duty_cycle =1;
+        }
+        for (i=0; i<=63; i++)
+        {
+                //system_L2
+                for (j=0; j<20; j++) sys.L2[i].L2_config[j]=1;
+                sys.L2[i].clockrate=1;
+                for (j=0; j<20; j++) sys.L2[i].ports[j]=1;
+                sys.L2[i].device_type=1;
+                strcpy(sys.L2[i].threeD_stack,"default");
+                for (j=0; j<20; j++) sys.L2[i].buffer_sizes[j]=1;
+                sys.L2[i].total_accesses=1;
+                sys.L2[i].read_accesses=1;
+                sys.L2[i].write_accesses=1;
+                sys.L2[i].total_hits=1;
+                sys.L2[i].total_misses=1;
+                sys.L2[i].read_hits=1;
+                sys.L2[i].write_hits=1;
+                sys.L2[i].read_misses=1;
+                sys.L2[i].write_misses=1;
+                sys.L2[i].replacements=1;
+                sys.L2[i].write_backs=1;
+                sys.L2[i].miss_buffer_accesses=1;
+                sys.L2[i].fill_buffer_accesses=1;
+                sys.L2[i].prefetch_buffer_accesses=1;
+                sys.L2[i].prefetch_buffer_writes=1;
+                sys.L2[i].prefetch_buffer_reads=1;
+                sys.L2[i].prefetch_buffer_hits=1;
+                sys.L2[i].wbb_writes=1;
+                sys.L2[i].wbb_reads=1;
+                sys.L2[i].duty_cycle =1;
+                sys.L2[i].merged_dir=false;
+                sys.L2[i].homenode_read_accesses =1;
+                sys.L2[i].homenode_write_accesses=1;
+                sys.L2[i].homenode_read_hits=1;
+                sys.L2[i].homenode_write_hits=1;
+                sys.L2[i].homenode_read_misses=1;
+                sys.L2[i].homenode_write_misses=1;
+                sys.L2[i].dir_duty_cycle=1;
+        }
+        for (i=0; i<=63; i++)
+        {
+                //system_L3
+                for (j=0; j<20; j++) sys.L3[i].L3_config[j]=1;
+                sys.L3[i].clockrate=1;
+                for (j=0; j<20; j++) sys.L3[i].ports[j]=1;
+                sys.L3[i].device_type=1;
+                strcpy(sys.L3[i].threeD_stack,"default");
+                for (j=0; j<20; j++) sys.L3[i].buffer_sizes[j]=1;
+                sys.L3[i].total_accesses=1;
+                sys.L3[i].read_accesses=1;
+                sys.L3[i].write_accesses=1;
+                sys.L3[i].total_hits=1;
+                sys.L3[i].total_misses=1;
+                sys.L3[i].read_hits=1;
+                sys.L3[i].write_hits=1;
+                sys.L3[i].read_misses=1;
+                sys.L3[i].write_misses=1;
+                sys.L3[i].replacements=1;
+                sys.L3[i].write_backs=1;
+                sys.L3[i].miss_buffer_accesses=1;
+                sys.L3[i].fill_buffer_accesses=1;
+                sys.L3[i].prefetch_buffer_accesses=1;
+                sys.L3[i].prefetch_buffer_writes=1;
+                sys.L3[i].prefetch_buffer_reads=1;
+                sys.L3[i].prefetch_buffer_hits=1;
+                sys.L3[i].wbb_writes=1;
+                sys.L3[i].wbb_reads=1;
+                sys.L3[i].duty_cycle =1;
+                sys.L3[i].merged_dir=false;
+                sys.L3[i].homenode_read_accesses =1;
+                sys.L3[i].homenode_write_accesses=1;
+                sys.L3[i].homenode_read_hits=1;
+                sys.L3[i].homenode_write_hits=1;
+                sys.L3[i].homenode_read_misses=1;
+                sys.L3[i].homenode_write_misses=1;
+                sys.L3[i].dir_duty_cycle=1;
+        }
+        //system_NoC
+        for (i=0; i<=63; i++)
+        {
+                sys.NoC[i].clockrate=1;
+                sys.NoC[i].type=true;
+                sys.NoC[i].chip_coverage=1;
+                sys.NoC[i].has_global_link = true;
+                strcpy(sys.NoC[i].topology,"default");
+                sys.NoC[i].horizontal_nodes=1;
+                sys.NoC[i].vertical_nodes=1;
+                sys.NoC[i].input_ports=1;
+                sys.NoC[i].output_ports=1;
+                sys.NoC[i].virtual_channel_per_port=1;
+                sys.NoC[i].flit_bits=1;
+                sys.NoC[i].input_buffer_entries_per_vc=1;
+                sys.NoC[i].total_accesses=1;
+                sys.NoC[i].duty_cycle=1;
+                sys.NoC[i].route_over_perc = 0.5;
+                for (j=0; j<20; j++) sys.NoC[i].ports_of_input_buffer[j]=1;
+                sys.NoC[i].number_of_crossbars=1;
+                strcpy(sys.NoC[i].crossbar_type,"default");
+                strcpy(sys.NoC[i].crosspoint_type,"default");
+                //system.NoC?.xbar0;
+                sys.NoC[i].xbar0.number_of_inputs_of_crossbars=1;
+                sys.NoC[i].xbar0.number_of_outputs_of_crossbars=1;
+                sys.NoC[i].xbar0.flit_bits=1;
+                sys.NoC[i].xbar0.input_buffer_entries_per_port=1;
+                sys.NoC[i].xbar0.ports_of_input_buffer[20]=1;
+                sys.NoC[i].xbar0.crossbar_accesses=1;
+        }
+        //system_mem
+        sys.mem.mem_tech_node=1;
+        sys.mem.device_clock=1;
+        sys.mem.capacity_per_channel=1;
+        sys.mem.number_ranks=1;
+        sys.mem.peak_transfer_rate =1;
+        sys.mem.num_banks_of_DRAM_chip=1;
+        sys.mem.Block_width_of_DRAM_chip=1;
+        sys.mem.output_width_of_DRAM_chip=1;
+        sys.mem.page_size_of_DRAM_chip=1;
+        sys.mem.burstlength_of_DRAM_chip=1;
+        sys.mem.internal_prefetch_of_DRAM_chip=1;
+        sys.mem.memory_accesses=1;
+        sys.mem.memory_reads=1;
+        sys.mem.memory_writes=1;
+        //system_mc
+        sys.mc.mc_clock =1;
+        sys.mc.number_mcs=1;
+        sys.mc.peak_transfer_rate =1;
+        sys.mc.memory_channels_per_mc=1;
+        sys.mc.number_ranks=1;
+        sys.mc.req_window_size_per_channel=1;
+        sys.mc.IO_buffer_size_per_channel=1;
+        sys.mc.databus_width=1;
+        sys.mc.addressbus_width=1;
+        sys.mc.memory_accesses=1;
+        sys.mc.memory_reads=1;
+        sys.mc.memory_writes=1;
+        sys.mc.LVDS=true;
+        sys.mc.type=1;
+        //system_niu
+        sys.niu.clockrate =1;
+        sys.niu.number_units=1;
+        sys.niu.type = 1;
+        sys.niu.duty_cycle =1;
+        sys.niu.total_load_perc=1;
+        //system_pcie
+        sys.pcie.clockrate =1;
+        sys.pcie.number_units=1;
+        sys.pcie.num_channels=1;
+        sys.pcie.type = 1;
+        sys.pcie.withPHY = false;
+        sys.pcie.duty_cycle =1;
+        sys.pcie.total_load_perc=1;
+        //system_flash_controller
+        sys.flashc.mc_clock =1;
+        sys.flashc.number_mcs=1;
+        sys.flashc.peak_transfer_rate =1;
+        sys.flashc.memory_channels_per_mc=1;
+        sys.flashc.number_ranks=1;
+        sys.flashc.req_window_size_per_channel=1;
+        sys.flashc.IO_buffer_size_per_channel=1;
+        sys.flashc.databus_width=1;
+        sys.flashc.addressbus_width=1;
+        sys.flashc.memory_accesses=1;
+        sys.flashc.memory_reads=1;
+        sys.flashc.memory_writes=1;
+        sys.flashc.LVDS=true;
+        sys.flashc.withPHY = false;
+        sys.flashc.type =1;
+        sys.flashc.duty_cycle =1;
+        sys.flashc.total_load_perc=1;
+}
diff --git a/ext/mcpat/XML_Parse.h b/ext/mcpat/XML_Parse.h
new file mode 100644 (file)
index 0000000..88fd3da
--- /dev/null
@@ -0,0 +1,591 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef XML_PARSE_H_
+#define XML_PARSE_H_
+
+
+//#ifdef WIN32
+//#define _CRT_SECURE_NO_DEPRECATE
+//#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include <iostream>
+
+#include "xmlParser.h"
+using namespace std;
+
+/*
+void myfree(char *t); // {free(t);}
+ToXMLStringTool tx,tx2;
+*/
+//all subnodes at the level of system.core(0-n)
+//cache_policy is added into cache property arrays; 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
+
+typedef struct{ // Fields of the system.core(0-n) predictor subnode; XML_Parse.cc accesses them as sys.core[i].predictor.*
+        int prediction_width;
+        char prediction_scheme[20]; // text buffer; ParseXML::initialize() copies "default" into it
+        int predictor_size;
+        int predictor_entries;
+        int local_predictor_size[20]; // ParseXML::initialize() sets slots 0..19 to 1
+        int local_predictor_entries;
+        int global_predictor_entries;
+        int global_predictor_bits;
+        int chooser_predictor_entries;
+        int chooser_predictor_bits;
+        double predictor_accesses; // NOTE(review): presumably an access-count stat (double, like the stats in the sibling structs) - confirm
+} predictor_systemcore;
+typedef struct{ // Fields of the system.core(0-n) itlb subnode; XML_Parse.cc accesses them as sys.core[i].itlb.*
+        int number_entries;
+        int cache_policy;// 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
+        double total_hits;
+        double total_accesses;
+        double total_misses;
+        double conflicts; // NOTE(review): like cache_policy, not assigned in ParseXML::initialize() - confirm it is set before use
+} itlb_systemcore;
+typedef struct{ // Fields of the system.core(0-n) icache subnode; XML_Parse.cc accesses them as sys.core[i].icache.*
+        //params
+        double icache_config[20]; // ParseXML::initialize() sets slots 0..19 to 1
+        int buffer_sizes[20]; // NOTE(review): not assigned in ParseXML::initialize() (only a commented-out strcpy refers to it) - confirm it is set before use
+        int cache_policy;// 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double read_misses;
+        double replacements;
+        double read_hits;
+        double total_hits;
+        double total_misses;
+        double miss_buffer_access;
+        double fill_buffer_accesses;
+        double prefetch_buffer_accesses;
+        double prefetch_buffer_writes;
+        double prefetch_buffer_reads;
+        double prefetch_buffer_hits;
+        double conflicts; // NOTE(review): like cache_policy, not assigned in ParseXML::initialize() - confirm it is set before use
+} icache_systemcore;
+typedef struct{ // Fields of the system.core(0-n) dtlb subnode; XML_Parse.cc accesses them as sys.core[i].dtlb.*
+        //params
+        int number_entries;
+        int cache_policy;// 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double write_hits;
+        double read_hits;
+        double read_misses;
+        double write_misses;
+        double total_hits;
+        double total_misses;
+        double conflicts; // NOTE(review): like cache_policy, not assigned in ParseXML::initialize() - confirm it is set before use
+} dtlb_systemcore;
+typedef struct{ // Fields of the system.core(0-n) dcache subnode; XML_Parse.cc accesses them as sys.core[i].dcache.*
+        //params
+        double dcache_config[20]; // ParseXML::initialize() sets slots 0..19 to 1
+        int buffer_sizes[20]; // NOTE(review): not assigned in ParseXML::initialize() (only a commented-out strcpy refers to it) - confirm it is set before use
+        int cache_policy;// 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double total_hits;
+        double total_misses;
+        double read_hits;
+        double write_hits;
+        double read_misses;
+        double write_misses;
+        double replacements;
+        double write_backs;
+        double miss_buffer_access;
+        double fill_buffer_accesses;
+        double prefetch_buffer_accesses;
+        double prefetch_buffer_writes;
+        double prefetch_buffer_reads;
+        double prefetch_buffer_hits;
+        double wbb_writes;
+        double wbb_reads;
+        double conflicts; // NOTE(review): like cache_policy, not assigned in ParseXML::initialize() - confirm it is set before use
+} dcache_systemcore;
+typedef struct{ // Fields of the system.core(0-n) BTB subnode; XML_Parse.cc accesses them as sys.core[i].BTB.*
+        //params
+        int BTB_config[20]; // ParseXML::initialize() sets slots 0..19 to 1
+        //stats (each one is set to 1 by ParseXML::initialize())
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double total_hits;
+        double total_misses;
+        double read_hits;
+        double write_hits;
+        double read_misses;
+        double write_misses;
+        double replacements;
+} BTB_systemcore;
+typedef struct{ // One core's record: configuration params, activity stats, and nested sub-component records
+        //all params at the level of system.core(0-n)
+        int clock_rate;
+        bool opt_local; // Xeon.xml: set to 0 for cores with unknown timing to force off the opt flag
+        bool x86;
+        int machine_bits;
+        int virtual_address_width;
+        int physical_address_width;
+        int opcode_width;
+        int micro_opcode_width;
+        int instruction_length;
+        int machine_type; // Xeon.xml: 1 inorder; 0 OOO
+        int internal_datapath_width;
+        int number_hardware_threads;
+        int fetch_width; // Xeon.xml: determines the size of cachelines of L1 cache block
+        int number_instruction_fetch_ports; // Xeon.xml: icache ports
+        int decode_width; // Xeon.xml: determines the number of ports of the renaming table
+        int issue_width; // Xeon.xml: determines the number of ports of the issue window; issue_width==dispatch_width
+        int peak_issue_width;
+        int commit_width; // Xeon.xml: determines the number of ports of register files
+        int pipelines_per_core[20]; // Xeon.xml supplies two values (integer, floating); use of the other slots not visible here
+        int pipeline_depth[20]; // Xeon.xml supplies two values: pipeline depth of int and fp
+        char FPU[20];
+        char divider_multiplier[20];
+        int ALU_per_core; // Xeon.xml: each contains an adder, a shifter, and a logical unit
+        double FPU_per_core;
+        int MUL_per_core; // Xeon.xml: for MUL and Div
+        int instruction_buffer_size; // Xeon.xml: buffer between IF and ID stage
+        int decoded_stream_buffer_size; // Xeon.xml: buffer between ID and sche/exe stage
+        int instruction_window_scheme; // Xeon.xml: 0 PHYREG based, 1 RSBASED
+        int instruction_window_size;
+        int fp_instruction_window_size;
+        int ROB_size; // Xeon.xml: each in-flight instruction has an entry in ROB
+        int archi_Regs_IRF_size;
+        int archi_Regs_FRF_size;
+        int phy_Regs_IRF_size;
+        int phy_Regs_FRF_size;
+        int rename_scheme; // Xeon.xml: RAM based(0) or CAM based(1)
+        int register_windows_size; // Xeon.xml: number of register windows; 0 means no register windowing
+        char LSU_order[20]; // Xeon.xml passes a string here (e.g. "inorder")
+        int store_buffer_size;
+        int load_buffer_size;
+        int memory_ports; // Xeon.xml: number of ports refers to sustainable concurrent memory accesses
+        char Dcache_dual_pump[20];
+        int RAS_size;
+        int fp_issue_width;
+        int prediction_width; // Xeon.xml: number of branch instructions that can be predicted simultaneously
+        int number_of_BTB;
+        int number_of_BPT;
+
+        //all stats at the level of system.core(0-n)
+        double total_instructions; // Xeon.xml: for X86 targets all instruction counts refer to (fused) micro-ops
+        double int_instructions;
+        double fp_instructions;
+        double branch_instructions;
+        double branch_mispredictions;
+        double committed_instructions;
+        double committed_int_instructions;
+        double committed_fp_instructions;
+        double load_instructions;
+        double store_instructions;
+        double total_cycles; // Xeon.xml: per-core cycle stats are used for heterogeneous cores only
+        double idle_cycles;
+        double busy_cycles;
+        double instruction_buffer_reads;
+        double instruction_buffer_write;
+        double ROB_reads;
+        double ROB_writes;
+        double rename_accesses;
+        double fp_rename_accesses;
+        double rename_reads; // Xeon.xml: lookup in renaming logic
+        double rename_writes; // Xeon.xml: update dest regs in renaming logic
+        double fp_rename_reads;
+        double fp_rename_writes;
+        double inst_window_reads;
+        double inst_window_writes;
+        double inst_window_wakeup_accesses;
+        double inst_window_selections;
+        double fp_inst_window_reads;
+        double fp_inst_window_writes;
+        double fp_inst_window_wakeup_accesses;
+        double fp_inst_window_selections;
+        double archi_int_regfile_reads;
+        double archi_float_regfile_reads;
+        double phy_int_regfile_reads;
+        double phy_float_regfile_reads;
+        double phy_int_regfile_writes;
+        double phy_float_regfile_writes;
+        double archi_int_regfile_writes;
+        double archi_float_regfile_writes;
+        double int_regfile_reads;
+        double float_regfile_reads;
+        double int_regfile_writes;
+        double float_regfile_writes;
+        double windowed_reg_accesses;
+        double windowed_reg_transports;
+        double function_calls;
+        double context_switches;
+        double ialu_accesses;
+        double fpu_accesses; // Xeon.xml: should include accesses to multiplier and divider
+        double mul_accesses;
+        double cdb_alu_accesses;
+        double cdb_mul_accesses;
+        double cdb_fpu_accesses;
+        double load_buffer_reads;
+        double load_buffer_writes;
+        double load_buffer_cams;
+        double store_buffer_reads;
+        double store_buffer_writes;
+        double store_buffer_cams;
+        double store_buffer_forwards;
+        double main_memory_access;
+        double main_memory_read;
+        double main_memory_write;
+        double pipeline_duty_cycle; // Xeon.xml: <=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous
+
+        double IFU_duty_cycle ; // Xeon.xml labels the *_duty_cycle stats below as "AF for max power computation"
+        double BR_duty_cycle ;
+        double LSU_duty_cycle ;
+        double MemManU_I_duty_cycle;
+        double MemManU_D_duty_cycle ;
+        double ALU_duty_cycle ;
+        double MUL_duty_cycle ;
+        double FPU_duty_cycle ;
+        double ALU_cdb_duty_cycle ;
+        double MUL_cdb_duty_cycle ;
+        double FPU_cdb_duty_cycle ;
+
+        //all subnodes at the level of system.core(0-n)
+        predictor_systemcore predictor; // each subnode is embedded by value, not by pointer
+        itlb_systemcore itlb;
+        icache_systemcore icache;
+        dtlb_systemcore dtlb;
+        dcache_systemcore dcache;
+        BTB_systemcore BTB;
+
+} system_core;
+typedef struct{ // First-level directory record: configuration params followed by access stats
+        //params
+        int Directory_type; // Xeon.xml: 0 cam based shadowed tag, 1 directory cache
+        double Dir_config[20]; // Xeon.xml lists: capacity, block_width, associativity, bank, throughput and latency w.r.t. core clock
+        int buffer_sizes[20]; // Xeon.xml: all the buffer related settings are optional
+        int clockrate;
+        int ports[20]; // Xeon.xml: number of r, w, and rw search ports
+        int device_type;
+        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
+        char threeD_stack[20];
+        //stats
+        double total_accesses;
+        double read_accesses; // Xeon.xml: the simulator must cast every access type into reads or writes (e.g. invalidates as writes)
+        double write_accesses;
+        double read_misses;
+        double write_misses;
+        double conflicts;
+        double duty_cycle;
+} system_L1Directory;
+typedef struct{ // Second-level directory record: same field list as system_L1Directory, kept as a distinct type
+        //params
+        int Directory_type; // Xeon.xml: 0 cam based shadowed tag, 1 directory cache
+        double Dir_config[20]; // Xeon.xml lists: capacity, block_width, associativity, bank, throughput and latency w.r.t. core clock
+        int buffer_sizes[20]; // Xeon.xml: all the buffer related settings are optional
+        int clockrate;
+        int ports[20]; // NOTE(review): presumably r, w, rw search ports as for the L1 directory; confirm against the parser
+        int device_type;
+        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
+        char threeD_stack[20];
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double read_misses;
+        double write_misses;
+        double conflicts;
+        double duty_cycle;
+} system_L2Directory;
+typedef struct{ // L2 cache record: configuration params, access stats, then home-node/directory stats
+        //params
+        double L2_config[20]; // NOTE(review): element meaning is set by the parser, which is not visible here
+        int clockrate;
+        int ports[20];
+        int device_type;
+        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
+        char threeD_stack[20];
+        int buffer_sizes[20];
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double total_hits;
+        double total_misses;
+        double read_hits;
+        double write_hits;
+        double read_misses;
+        double write_misses;
+        double replacements;
+        double write_backs;
+        double miss_buffer_accesses;
+        double fill_buffer_accesses;
+        double prefetch_buffer_accesses;
+        double prefetch_buffer_writes;
+        double prefetch_buffer_reads;
+        double prefetch_buffer_hits;
+        double wbb_writes;
+        double wbb_reads;
+        double conflicts;
+        double duty_cycle;
+
+        bool   merged_dir; // NOTE(review): presumably marks a directory merged into this cache, making the homenode_* stats meaningful; confirm
+        double homenode_read_accesses;
+        double homenode_write_accesses;
+        double homenode_read_hits;
+        double homenode_write_hits;
+        double homenode_read_misses;
+        double homenode_write_misses;
+        double dir_duty_cycle;
+} system_L2;
+typedef struct{ // L3 cache record: same field list as system_L2 apart from the name of the config array
+        //params
+        double L3_config[20]; // NOTE(review): element meaning is set by the parser, which is not visible here
+        int clockrate;
+        int ports[20];
+        int device_type;
+        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
+        char threeD_stack[20];
+        int buffer_sizes[20];
+        //stats
+        double total_accesses;
+        double read_accesses;
+        double write_accesses;
+        double total_hits;
+        double total_misses;
+        double read_hits;
+        double write_hits;
+        double read_misses;
+        double write_misses;
+        double replacements;
+        double write_backs;
+        double miss_buffer_accesses;
+        double fill_buffer_accesses;
+        double prefetch_buffer_accesses;
+        double prefetch_buffer_writes;
+        double prefetch_buffer_reads;
+        double prefetch_buffer_hits;
+        double wbb_writes;
+        double wbb_reads;
+        double conflicts;
+        double duty_cycle;
+
+        bool   merged_dir; // NOTE(review): presumably marks a directory merged into this cache, making the homenode_* stats meaningful; confirm
+        double homenode_read_accesses;
+        double homenode_write_accesses;
+        double homenode_read_hits;
+        double homenode_write_hits;
+        double homenode_read_misses;
+        double homenode_write_misses;
+        double dir_duty_cycle;
+} system_L3;
+typedef struct{ // Crossbar record; system_NoC embeds one instance as its xbar0 member
+        //params
+        int number_of_inputs_of_crossbars;
+        int number_of_outputs_of_crossbars;
+        int flit_bits;
+        int input_buffer_entries_per_port;
+        int ports_of_input_buffer[20];
+        //stats
+        double crossbar_accesses;
+} xbar0_systemNoC;
+typedef struct{ // Network-on-chip record: configuration params, one embedded crossbar record, then stats
+        //params
+        int clockrate;
+        bool type; // NOTE(review): meaning of the two values is not visible in this header; check the parser and sample configs
+        bool has_global_link;
+        char topology[20];
+        int horizontal_nodes;
+        int vertical_nodes;
+        int link_throughput;
+        int link_latency;
+        int input_ports;
+        int output_ports;
+        int virtual_channel_per_port;
+        int flit_bits;
+        int input_buffer_entries_per_vc;
+        int ports_of_input_buffer[20];
+        int dual_pump;
+        int number_of_crossbars;
+        char crossbar_type[20];
+        char crosspoint_type[20];
+        xbar0_systemNoC xbar0; // embedded by value; only a single crossbar record is stored regardless of number_of_crossbars
+        int arbiter_type;
+        double chip_coverage;
+        //stats
+        double total_accesses;
+        double duty_cycle;
+        double route_over_perc;
+} system_NoC;
+typedef struct{ // Main-memory record: DRAM configuration params followed by access stats
+        //params
+        int mem_tech_node;
+        int device_clock;
+        int peak_transfer_rate; // note: integer here, whereas system_mc declares its peak_transfer_rate as double
+        int internal_prefetch_of_DRAM_chip;
+        int capacity_per_channel;
+        int number_ranks;
+        int num_banks_of_DRAM_chip;
+        int Block_width_of_DRAM_chip;
+        int output_width_of_DRAM_chip;
+        int page_size_of_DRAM_chip; // Xeon.xml: the physical line in a DRAM bank, unrelated to virtual_memory_page_size
+        int burstlength_of_DRAM_chip;
+        //stats
+        double memory_accesses;
+        double memory_reads;
+        double memory_writes;
+} system_mem;
+typedef struct{ // Controller record; root_system declares both its `mc` and `flashc` members with this type
+        //params
+    //Common Param for mc and fc
+        double peak_transfer_rate;
+        int number_mcs;
+        bool withPHY;
+        int type;
+
+        //FCParam
+        //stats
+        double duty_cycle;
+        double total_load_perc;
+
+        //McParam
+        int mc_clock;
+    int llc_line_length;
+        int memory_channels_per_mc;
+        int number_ranks;
+        int req_window_size_per_channel;
+        int IO_buffer_size_per_channel;
+        int databus_width;
+        int addressbus_width;
+        bool LVDS;
+
+        //stats
+        double memory_accesses;
+        double memory_reads;
+        double memory_writes;
+} system_mc;
+
+typedef struct{ // Record behind root_system's `niu` member: three params and two load stats
+        //params
+    int clockrate;
+        int number_units;
+        int type;
+        //stats
+        double duty_cycle;
+        double total_load_perc;
+} system_niu;
+
+typedef struct{ // Record behind root_system's `pcie` member: params followed by two load stats
+        //params
+    int clockrate;
+        int number_units;
+        int num_channels;
+        int type;
+        bool withPHY;
+        //stats
+        double duty_cycle;
+        double total_load_perc;
+} system_pcie;
+
+typedef struct{ // Top-level chip record: component counts, system-wide params, and every component record embedded by value
+        //All number_of_* at the level of 'system' Ying 03/21/2009
+        int number_of_cores; // Xeon.xml: McPAT will skip a component if its number is set to 0
+        int number_of_L1Directories;
+        int number_of_L2Directories;
+        int number_of_L2s; // Xeon.xml: number of L2 clusters; a cluster can have multiple banks/ports
+        bool Private_L2; // Xeon.xml: 1 Private, 0 shared/coherent
+        int number_of_L3s; // Xeon.xml: number of L3 clusters
+        int number_of_NoCs;
+        int number_of_dir_levels;
+    int domain_size;
+    int first_level_dir;
+        // All params at the level of 'system'
+        int homogeneous_cores; // Xeon.xml: 1 means homogeneous
+        int homogeneous_L1Directories;
+        int homogeneous_L2Directories;
+        double core_tech_node; // Xeon.xml: nm
+        int target_core_clockrate; // Xeon.xml: MHz
+        int target_chip_area;
+        int temperature; // Xeon.xml: Kelvin
+        int number_cache_levels;
+        int L1_property;
+        int L2_property;
+        int homogeneous_L2s;
+        int L3_property;
+        int homogeneous_L3s;
+        int homogeneous_NoCs;
+        int homogeneous_ccs; // Xeon.xml: cache coherence hardware
+        int Max_area_deviation;
+        int Max_power_deviation;
+        int device_type; // Xeon.xml: 0 HP (High Performance), 1 LSTP (Low standby power), 2 LOP (Low Operating Power)
+        bool longer_channel_device; // Xeon.xml: 0 no use; 1 use when appropriate
+        bool Embedded;
+        bool opt_dynamic_power;
+        bool opt_lakage_power; // NOTE(review): "lakage" is presumably a misspelling of "leakage"; the name is kept because other code refers to it
+        bool opt_clockrate;
+        bool opt_area;
+        int interconnect_projection_type; // Xeon.xml: 0 aggressive wire technology; 1 conservative wire technology
+        int machine_bits;
+        int virtual_address_width; // Xeon.xml: address width determines tag_width; default is machine_bits if not set
+        int physical_address_width;
+        int virtual_memory_page_size; // Xeon.xml: page size in bytes from the OS/architecture perspective
+    double total_cycles;
+        //system.core(0-n):3rd level
+        system_core core[64]; // every per-component array is fixed at 64 entries
+        system_L1Directory L1Directory[64];
+        system_L2Directory L2Directory[64];
+        system_L2 L2[64];
+        system_L3 L3[64];
+    system_NoC NoC[64];
+    system_mem mem;
+        system_mc mc;
+        system_mc flashc; // shares the system_mc type with mc
+        system_niu niu;
+        system_pcie pcie;
+} root_system;
+
+class ParseXML // Declares two member functions and owns the root_system record, all public
+{
+public:
+        void parse(char* filepath); // defined outside this header; presumably fills sys from the XML file at filepath (confirm)
+    void initialize(); // defined outside this header; presumably sets sys to default values (confirm)
+public:
+        root_system sys; // stored by value, so each ParseXML object carries all 64-entry component arrays
+};
+
+
+#endif /* XML_PARSE_H_ */
+
+
+
+
diff --git a/ext/mcpat/Xeon.xml b/ext/mcpat/Xeon.xml
new file mode 100644 (file)
index 0000000..5342104
--- /dev/null
@@ -0,0 +1,455 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+       <component id="system" name="system">
+               <!--McPAT will skip the components if number is set to 0 -->
+               <param name="number_of_cores" value="2"/>
+               <param name="number_of_L1Directories" value="0"/>
+               <param name="number_of_L2Directories" value="0"/>
+               <param name="number_of_L2s" value="1"/> <!-- Number of L2 clusters; each cluster can contain multiple banks/ports -->
+               <param name="Private_L2" value="1"/><!--1 Private, 0 shared/coherent -->
+               <param name="number_of_L3s" value="1"/> <!-- This number means how many L3 clusters -->
+               <param name="number_of_NoCs" value="1"/>
+               <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+               <param name="homogeneous_L2s" value="1"/>
+               <param name="homogeneous_L1Directorys" value="1"/>
+               <param name="homogeneous_L2Directorys" value="1"/>
+               <param name="homogeneous_L3s" value="1"/>
+               <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+               <param name="homogeneous_NoCs" value="1"/>
+               <param name="core_tech_node" value="65"/><!-- nm -->
+               <param name="target_core_clockrate" value="3400"/><!--MHz -->
+               <param name="temperature" value="380"/> <!-- Kelvin -->
+               <param name="number_cache_levels" value="3"/>
+               <param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+               <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+               <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+               <param name="machine_bits" value="64"/>
+               <param name="virtual_address_width" value="64"/>
+               <param name="physical_address_width" value="52"/>
+               <param name="virtual_memory_page_size" value="4096"/>
+               <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+                       the default value is machine_bits, if not set -->
+               <stat name="total_cycles" value="100000"/>
+               <stat name="idle_cycles" value="0"/>
+               <stat name="busy_cycles"  value="100000"/>
+                       <!-- virtual_memory_page_size (in bytes) is completely different from the page size in the main memory section. This page size is the size of
+                       a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+               <!-- *********************** cores ******************* -->
+               <component id="system.core0" name="core0">
+                       <!-- Core property -->
+                       <param name="clock_rate" value="3400"/>
+                       <!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+                       <param name="opt_local" value="0"/>
+                       <param name="instruction_length" value="32"/>
+                       <param name="opcode_width" value="16"/>
+                       <param name="x86" value="1"/>
+                       <param name="micro_opcode_width" value="8"/>
+                       <param name="machine_type" value="0"/>
+                       <!-- inorder/OoO; 1 inorder; 0 OOO-->
+                       <param name="number_hardware_threads" value="2"/>
+                       <!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
+                       it may only be more than one in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+                       branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
+                       <param name="fetch_width" value="4"/>
+                       <!-- fetch_width determines the size of cachelines of L1 cache block -->
+                       <param name="number_instruction_fetch_ports" value="1"/>
+                       <param name="decode_width" value="4"/>
+                       <!-- decode_width determines the number of ports of the
+                       renaming table (both RAM and CAM) scheme -->
+                       <param name="issue_width" value="4"/>
+                       <param name="peak_issue_width" value="6"/>
+                       <!-- issue_width determines the number of ports of the issue window and other logic,
+                       as in the complexity-effective processors paper; issue_width==dispatch_width -->
+                       <param name="commit_width" value="4"/>
+                       <!-- commit_width determines the number of ports of register files -->
+                       <param name="fp_issue_width" value="2"/>
+                       <param name="prediction_width" value="1"/> 
+                       <!-- number of branch instructions that can be predicted simultaneously -->
+                       <!-- The current version of McPAT does not distinguish int and floating point pipelines.
+                       These parameters are reserved for future use. -->
+                       <param name="pipelines_per_core" value="1,1"/>
+                       <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+                       <param name="pipeline_depth" value="31,31"/>
+                       <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+                       <!-- issue and exe unit-->
+                       <param name="ALU_per_core" value="6"/>
+                       <!-- contains an adder, a shifter, and a logical unit -->
+                       <param name="MUL_per_core" value="1"/>
+                       <!-- For MUL and Div -->
+                       <param name="FPU_per_core" value="2"/>          
+                       <!-- buffer between IF and ID stage -->
+                       <param name="instruction_buffer_size" value="32"/>
+                       <!-- buffer between ID and sche/exe stage -->
+                       <param name="decoded_stream_buffer_size" value="16"/>
+                       <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+                       <!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+                       <param name="instruction_window_size" value="64"/>
+                       <param name="fp_instruction_window_size" value="64"/>
+                       <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+                       <param name="ROB_size" value="128"/>
+                       <!-- each in-flight instruction has an entry in ROB -->
+                       <!-- registers -->
+                       <param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->                 
+                       <param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
+                       <!--  if OoO processor, phy_reg number is needed for renaming logic, 
+                       renaming logic is for both integer and floating point insts.  -->
+                       <param name="phy_Regs_IRF_size" value="256"/>
+                       <param name="phy_Regs_FRF_size" value="256"/>
+                       <!-- rename logic -->
+                       <param name="rename_scheme" value="0"/>
+                       <!-- can be RAM based(0) or CAM based(1) rename scheme 
+                       RAM-based scheme will have free list, status table;
+                       CAM-based scheme have the valid bit in the data field of the CAM 
+                       both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+                       Detailed RAT Implementation see TR -->
+                       <param name="register_windows_size" value="0"/>
+                       <!-- how many windows in the windowed register file, sun processors;
+                       no register windowing is used when this number is 0 -->
+                       <!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+                       they will always try to execute out of order, though. -->
+                       <param name="LSU_order" value="inorder"/>
+                       <param name="store_buffer_size" value="96"/>
+                       <!-- By default, in-order cores do not have load buffers -->
+                       <param name="load_buffer_size" value="48"/>     
+                       <!-- number of ports refer to sustainable concurrent memory accesses --> 
+                       <param name="memory_ports" value="2"/>  
+                       <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+                       as well as the ports of Dcache which is connected to LSU -->
+                       <!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+                       <param name="RAS_size" value="64"/>                                             
+                       <!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
+                       <!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+                       <stat name="total_instructions" value="400000"/>
+                       <stat name="int_instructions" value="200000"/>
+                       <stat name="fp_instructions" value="100000"/>
+                       <stat name="branch_instructions" value="100000"/>
+                       <stat name="branch_mispredictions" value="0"/>
+                       <stat name="load_instructions" value="0"/>
+                       <stat name="store_instructions" value="50000"/>
+                       <stat name="committed_instructions" value="400000"/>
+                       <stat name="committed_int_instructions" value="200000"/>
+                       <stat name="committed_fp_instructions" value="100000"/>
+                       <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+                       <!-- the following cycle stats are used for heterogeneous cores only;
+                               please ignore them for homogeneous cores -->
+                       <stat name="total_cycles" value="100000"/>
+                   <stat name="idle_cycles" value="0"/>
+                   <stat name="busy_cycles"  value="100000"/>
+                       <!-- instruction buffer stats -->
+                       <!-- ROB stats: both RS-based and Phy-based OoO cores have a ROB.
+                       The performance simulator should capture the difference in accesses;
+                       otherwise, McPAT has to guess based on the number of committed instructions. -->
+                       <stat name="ROB_reads" value="400000"/>
+                       <stat name="ROB_writes" value="400000"/>
+                       <!-- RAT accesses -->
+                       <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+                       <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+                       <stat name="fp_rename_reads" value="200000"/>
+                       <stat name="fp_rename_writes" value="100000"/>
+                       <!-- decode and rename stage use this, should be total ic - nop -->
+                       <!-- Inst window stats -->
+                       <stat name="inst_window_reads" value="400000"/>
+                       <stat name="inst_window_writes" value="400000"/>
+                       <stat name="inst_window_wakeup_accesses" value="800000"/>
+                       <stat name="fp_inst_window_reads" value="200000"/>
+                       <stat name="fp_inst_window_writes" value="200000"/>
+                       <stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+                       <!--  RF accesses -->
+                       <stat name="int_regfile_reads" value="600000"/>
+                       <stat name="float_regfile_reads" value="100000"/>
+                       <stat name="int_regfile_writes" value="300000"/>
+                       <stat name="float_regfile_writes" value="50000"/>
+                       <!-- accesses to the working reg -->
+                       <stat name="function_calls" value="5"/>
+                       <stat name="context_switches" value="260343"/>
+                       <!-- Number of window switches (number of function calls and returns)-->
+                       <!-- ALU stats: by default, the processor has one FPU that includes the divider and
+                        multiplier. The fpu accesses should include accesses to the multiplier and divider -->
+                       <stat name="ialu_accesses" value="300000"/>                     
+                       <stat name="fpu_accesses" value="100000"/>
+                       <stat name="mul_accesses" value="200000"/>
+                       <stat name="cdb_alu_accesses" value="300000"/>
+                       <stat name="cdb_mul_accesses" value="200000"/>
+                       <stat name="cdb_fpu_accesses" value="100000"/>
+                       <!-- multiple cycle accesses should be counted multiple times, 
+                       otherwise, McPAT can use internal counter for different floating point instructions 
+                       to get final accesses. But that needs detailed info for floating point inst mix -->
+                       <!--  Currently the performance simulator should
+                       make sure all the numbers are final numbers,
+                       including the explicit read/write accesses
+                       and the implicit accesses such as replacements, etc.
+                       Future versions of McPAT may be able to infer the implicit accesses
+                       based on the params and stats of the last level cache.
+                       The same rule applies to all cache access stats too!  -->
+                       <!-- following is AF for max power computation. 
+                               Do not change them, unless you understand them-->
+                       <stat name="IFU_duty_cycle" value="1"/>                 
+                       <stat name="LSU_duty_cycle" value="0.5"/>
+                       <stat name="MemManU_I_duty_cycle" value="1"/>
+                       <stat name="MemManU_D_duty_cycle" value="0.5"/>
+                       <stat name="ALU_duty_cycle" value="1"/>
+                       <stat name="MUL_duty_cycle" value="0.3"/>
+                       <stat name="FPU_duty_cycle" value="0.3"/>
+                       <stat name="ALU_cdb_duty_cycle" value="1"/>
+                       <stat name="MUL_cdb_duty_cycle" value="0.3"/>
+                       <stat name="FPU_cdb_duty_cycle" value="0.3"/>
+                       <param name="number_of_BPT" value="2"/>
+                       <component id="system.core0.predictor" name="PBT">
+                               <!-- branch predictor; tournament predictor see Alpha implementation -->
+                               <param name="local_predictor_size" value="10,3"/>
+                               <param name="local_predictor_entries" value="1024"/>
+                               <param name="global_predictor_entries" value="4096"/>
+                               <param name="global_predictor_bits" value="2"/>
+                               <param name="chooser_predictor_entries" value="4096"/>
+                               <param name="chooser_predictor_bits" value="2"/>
+                               <!-- These parameters can be combined like below in next version
+                               <param name="load_predictor" value="10,3,1024"/>
+                               <param name="global_predictor" value="4096,2"/>
+                               <param name="predictor_chooser" value="4096,2"/>
+                               -->
+                       </component>
+                       <component id="system.core0.itlb" name="itlb">
+                               <param name="number_entries" value="128"/>
+                               <stat name="total_accesses" value="200000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                               <!-- there is no write requests to itlb although writes happen to itlb after miss, 
+                               which is actually a replacement -->
+                       </component>
+                       <component id="system.core0.icache" name="icache">
+                               <!-- there are no write requests to the icache, although writes happen to it after a miss;
+                               such a write is actually a replacement -->
+                               <param name="icache_config" value="131072,32,8,1,8,3,32,0"/>
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+                               <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+                               <param name="buffer_sizes" value="16, 16, 16,0"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="read_misses" value="0"/>
+                               <stat name="conflicts" value="0"/>                              
+                       </component>
+                       <component id="system.core0.dtlb" name="dtlb">
+                               <param name="number_entries" value="128"/><!--dual threads-->
+                               <stat name="total_accesses" value="400000"/>
+                               <stat name="total_misses" value="4"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <component id="system.core0.dcache" name="dcache">
+                               <!-- all the buffer related are optional -->
+                               <param name="dcache_config" value="16384,16,4,1, 3,3, 16,1 "/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                       </component>
+                       <param name="number_of_BTB" value="2"/>
+                       <component id="system.core0.BTB" name="BTB">
+                               <!-- all the buffer related are optional -->
+                               <param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
+                               <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+                               <stat name="write_accesses" value="0"/>
+                       </component>
+       </component>
+               <component id="system.L1Directory0" name="L1Directory0">
+                               <param name="Directory_type" value="0"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="800000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="20"/>     
+               </component>
+               <component id="system.L2Directory0" name="L2Directory0">
+                               <param name="Directory_type" value="1"/>
+                           <!--0 cam based shadowed tag. 1 directory cache --> 
+                               <param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                           <param name="buffer_sizes" value="8, 8, 8, 8"/>     
+                               <!-- all the buffer related are optional -->
+                           <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw search ports -->
+                               <param name="device_type" value="0"/>
+                               <!-- although there are multiple access types, 
+                               Performance simulator needs to cast them into reads or writes
+                               e.g. the invalidates can be considered as writes -->
+                               <stat name="read_accesses" value="58824"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="100"/>    
+               </component>
+               <component id="system.L20" name="L20">
+                       <!-- all the buffer related are optional -->
+                               <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
+                               <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <param name="clockrate" value="3400"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <stat name="read_accesses" value="200000"/>
+                               <stat name="write_accesses" value="27276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                           <stat name="duty_cycle" value="1.0"/>       
+               </component>
+               
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+                               <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+                               <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+                               <param name="clockrate" value="850"/>
+                               <param name="ports" value="1,1,1"/>
+                               <!-- number of r, w, and rw ports -->
+                               <param name="device_type" value="0"/>
+                               <param name="buffer_sizes" value="16, 16, 16, 16"/>
+                               <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->      
+                               <stat name="read_accesses" value="11824"/>
+                               <stat name="write_accesses" value="11276"/>
+                               <stat name="read_misses" value="1632"/>
+                               <stat name="write_misses" value="183"/>
+                               <stat name="conflicts" value="0"/>      
+                               <stat name="duty_cycle" value="1.0"/>   
+               </component>
+<!--**********************************************************************-->
+               <component id="system.NoC0" name="noc0">
+                       <param name="clockrate" value="3400"/>
+                       <param name="type" value="0"/>
+                       <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+                               at each time only one node can send req -->
+                       <param name="horizontal_nodes" value="1"/>
+                       <param name="vertical_nodes" value="1"/>
+                       <param name="has_global_link" value="0"/>
+                       <!-- 1 has global link, 0 does not have global link -->
+                       <param name="link_throughput" value="1"/><!--w.r.t clock -->
+                       <param name="link_latency" value="1"/><!--w.r.t clock -->
+                       <!-- throughput >= latency -->
+                       <!-- Router architecture -->
+                       <param name="input_ports" value="1"/>
+                       <param name="output_ports" value="1"/>
+                       <!-- For bus the I/O ports should be 1 -->
+                       <param name="flit_bits" value="256"/>
+                       <param name="chip_coverage" value="1"/>
+                       <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+                               chip_coverage <=1 -->
+                       <param name="link_routing_over_percentage" value="0.5"/>
+                       <!-- Links can route over other components or occupy whole area.
+                               by default, 50% of the NoC global links routes over other 
+                               components -->
+                       <stat name="total_accesses" value="100000"/>
+                       <!-- This is the number of total accesses within the whole network not for each router -->
+                       <stat name="duty_cycle" value="1"/>
+               </component>            
+<!--**********************************************************************-->
+               <component id="system.mem" name="mem">
+                       <!-- Main memory property -->
+                       <param name="mem_tech_node" value="32"/>
+                       <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+                       <param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+                       <param name="internal_prefetch_of_DRAM_chip" value="4"/>
+                       <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+                       <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+                       <!-- above numbers can be easily found from Wikipedia -->
+                       <param name="capacity_per_channel" value="4096"/> <!-- MB -->
+                       <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+                       Current McPAT assumes single DIMMs are used.-->                 
+                       <param name="number_ranks" value="2"/>
+                       <param name="num_banks_of_DRAM_chip" value="8"/>                        
+                       <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+                       <param name="output_width_of_DRAM_chip" value="8"/>
+                       <!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+                       <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+                       <param name="burstlength_of_DRAM_chip" value="8"/>
+                       <stat name="memory_accesses" value="1052"/>
+                       <stat name="memory_reads" value="1052"/>
+                       <stat name="memory_writes" value="1052"/>                                                                       
+               </component>
+               <component id="system.mc" name="mc">
+                       <!-- Memory controllers are for DDR(2,3...) DIMMs -->
+                       <!-- current version of McPAT uses published values for base parameters of memory controller
+                       improvements on MC will be added in later versions. -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+                       <param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+                       <param name="block_size" value="64"/><!--B-->
+                       <param name="number_mcs" value="0"/>
+                       <!-- current McPAT only supports homogeneous memory controllers -->
+                       <param name="memory_channels_per_mc" value="1"/>
+                       <param name="number_ranks" value="2"/>
+                       <param name="withPHY" value="0"/>
+                       <!-- # of ranks of each channel-->
+                       <param name="req_window_size_per_channel" value="32"/>
+                       <param name="IO_buffer_size_per_channel" value="32"/>
+                       <param name="databus_width" value="128"/>
+                       <param name="addressbus_width" value="51"/>
+                       <!-- McPAT will add the control bus width to the addressbus width automatically -->
+                       <stat name="memory_accesses" value="33333"/>
+                       <stat name="memory_reads" value="16667"/>
+                       <stat name="memory_writes" value="16667"/>
+                       <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates 
+                       the average power per MC or per channel. This is sufficient for most applications. 
+                       Further trackdown can be easily added in later versions. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.niu" name="niu">
+                       <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+                       <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+                                the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates 
+                       the average power per nic or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.pcie" name="pcie">
+                       <!-- On chip PCIe controller, including Phy-->
+                       <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+                                the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+                       <param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+                       <param name="withPHY" value="1"/>
+                       <param name="clockrate" value="350"/>
+                       <param name="number_units" value="0"/>
+                       <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates 
+                       the average power per pcie controller or per channel. This is sufficient for most applications. -->
+               </component>
+<!--**********************************************************************-->
+               <component id="system.flashc" name="flashc">
+                   <param name="number_flashcs" value="0"/>
+                       <param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+                       <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+                       <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+                       <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+                       <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates 
+                       the average power per fc or per channel. This is sufficient for most applications -->
+               </component>
+<!--**********************************************************************-->
+
+               </component>
+</component>
+
diff --git a/ext/mcpat/arch_const.h b/ext/mcpat/arch_const.h
new file mode 100644 (file)
index 0000000..b0dfeaa
--- /dev/null
@@ -0,0 +1,276 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef ARCH_CONST_H_
+#define ARCH_CONST_H_
+
+typedef struct{ //NOTE(review): capacity/associativity/block-size triple; not referenced anywhere else in this header
+        unsigned int capacity;
+        unsigned int assoc;//fully -- presumably 0 selects fully associative (cf. "itlbassoc=0;//fully" below); TODO confirm
+        unsigned int blocksize;
+} array_inputs;
+
+//Do Not change, unless you want to bypass the XML interface and do not care about the default values.
+//Global parameters
+const int                      number_of_cores =       8;
+const int                      number_of_L2s   =       1;
+const int                      number_of_L3s   =       1;
+const int                      number_of_NoCs  =       1;
+
+const double           archi_F_sz_nm   =       90.0;
+const unsigned int     dev_type                =       0;
+const double           CLOCKRATE               =       1.2*1e9;
+const double           AF                              =       0.5;
+//const bool                   inorder                 =       true;
+const bool                     embedded                =       false; //NEW
+
+const bool                     homogeneous_cores       =       true;
+const bool                     temperature             =       360;
+const int                      number_cache_levels     =       3;
+const int                      L1_property             =       0; //private 0; coherent 1, shared 2.
+const int                      L2_property             =       2;
+const bool             homogeneous_L2s =       true;
+const bool                 L3_property         =       2;
+const bool                     homogeneous_L3s =       true;
+const double           Max_area_deviation      =       50;
+const double       Max_dynamic_deviation       =50; //New
+const int                      opt_dynamic_power       =       1;
+const int                      opt_lakage_power        =       0;
+const int                      opt_area                        =       0;
+const int                      interconnect_projection_type    =       0;
+
+//******************************Core Parameters
+#if (inorder) //NOTE(review): "inorder" is only a commented-out const in this header; unless it is #defined elsewhere the preprocessor treats it as 0 and this branch is never compiled
+const int opcode_length                        =       8;//Niagara
+const int reg_length                   =       5;//Niagara
+const int instruction_length   =       32;//Niagara
+const int data_width                   =       64;
+#else //this branch supplies the values whenever "inorder" is not a non-zero macro
+const int opcode_length                        =       8;//16;//Niagara
+const int reg_length                   =       7;//Niagara
+const int instruction_length   =       32;//Niagara
+const int data_width                   =       64;
+#endif
+
+
+//Caches
+//itlb
+const int itlbsize=512;
+const int itlbassoc=0;//fully
+const int itlbblocksize=8;
+//icache
+const int icachesize=32768;
+const int icacheassoc=4;
+const int icacheblocksize=32;
+//dtlb
+const int dtlbsize=512;
+const int dtlbassoc=0;//fully
+const int dtlbblocksize=8;
+//dcache
+const int dcachesize=32768;
+const int dcacheassoc=4;
+const int dcacheblocksize=32;
+const int dcache_write_buffers=8;
+
+//cache controllers
+//IB,
+const int numIBEntries                 =       64;
+const int IBsize                               =       64;//2*4*instruction_length/8*2;
+const int IBassoc                              =       0;//In Niagara it is still fully associ
+const int IBblocksize                  =       4;
+
+//IFB and MIL should have the same parameters CAM
+const int IFBsize=128;//
+const int IFBassoc=0;//In Niagara it is still fully associ
+const int IFBblocksize=4;
+
+
+
+
+const int icache_write_buffers=8;
+
+//register file RAM
+const int regfilesize=5760;
+const int regfileassoc=1;
+const int regfileblocksize=18;
+//regwin  RAM
+const int regwinsize=256;
+const int regwinassoc=1;
+const int regwinblocksize=8;
+
+
+
+//store buffer, lsq
+const int lsqsize=512;
+const int lsqassoc=0;
+const int lsqblocksize=8;
+
+//data fill queue RAM
+const int dfqsize=1024;
+const int dfqassoc=1;
+const int dfqblocksize=16;
+
+//outside the cores
+//L2 cache bank
+const int l2cachesize=262144;
+const int l2cacheassoc=16;
+const int l2cacheblocksize=64;
+
+//L2 directory
+const int l2dirsize=1024;
+const int l2dirassoc=0;
+const int l2dirblocksize=2;
+
+//crossbar
+//PCX
+const int PCX_NUMBER_INPUT_PORTS_CROSSBAR = 8;
+const int PCX_NUMBER_OUTPUT_PORTS_CROSSBAR = 9;
+const int PCX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =144;
+//PCX buffer RAM
+const int pcx_buffersize=1024;
+const int pcx_bufferassoc=1;
+const int pcx_bufferblocksize=32;
+const int pcx_numbuffer=5;
+//pcx arbiter
+const int pcx_arbsize=128;
+const int pcx_arbassoc=1;
+const int pcx_arbblocksize=2;
+const int pcx_numarb=5;
+
+//CPX
+const int CPX_NUMBER_INPUT_PORTS_CROSSBAR = 5;
+const int CPX_NUMBER_OUTPUT_PORTS_CROSSBAR = 8;
+const int CPX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =150;
+//CPX buffer RAM
+const int cpx_buffersize=1024;
+const int cpx_bufferassoc=1;
+const int cpx_bufferblocksize=32;
+const int cpx_numbuffer=8;
+//cpx arbiter
+const int cpx_arbsize=128;
+const int cpx_arbassoc=1;
+const int cpx_arbblocksize=2;
+const int cpx_numarb=8;
+
+
+
+
+
+const int numPhysFloatRegs=256;
+const int numPhysIntRegs=32;
+const int numROBEntries=192;
+const int umRobs=1; //NOTE(review): name looks like a typo for "numRobs"; left unchanged because other files may reference it
+
+const int BTBEntries=4096;
+const int BTBTagSize=16;
+const int LFSTSize=1024;
+const int LQEntries=32;
+const int RASSize=16;
+const int SQEntries=32;
+const int SSITSize=1024;
+const int activity=0;
+const int backComSize=5;
+const int cachePorts=200;
+const int choiceCtrBits=2;
+const int choicePredictorSize=8192;
+
+
+const int commitWidth=8;
+const int decodeWidth=8;
+const int dispatchWidth=8;
+const int fetchWidth=8;
+const int issueWidth=1;
+const int renameWidth=8;
+//what is this forwardComSize=5??
+
+const int globalCtrBits=2;
+const int globalHistoryBits=13;
+const int globalPredictorSize=8192;
+
+
+
+const int localCtrBits=2;
+const int localHistoryBits=11;
+const int localHistoryTableSize=2048;
+const int localPredictorSize=2048;
+
+const double Woutdrvnandn      =30 *0.09;//(24.0 * LSCALE) -- NOTE(review): in all six lines the literal is 1.25x the coefficient quoted in the comment; 0.09 presumably is the 90 nm feature size (archi_F_sz_nm) in um, TODO confirm
+const double Woutdrvnandp      =12.5 *0.09;//(10.0 * LSCALE)
+const double Woutdrvnorn       =7.5*0.09;//(6.0 * LSCALE)
+const double Woutdrvnorp  =50 * 0.09;//        (40.0 * LSCALE)
+const double Woutdrivern       =60*0.09;//(48.0 * LSCALE)
+const double Woutdriverp       =100 * 0.09;//(80.0 * LSCALE)
+
+/*
+smtCommitPolicy=RoundRobin
+smtFetchPolicy=SingleThread
+smtIQPolicy=Partitioned
+smtIQThreshold=100
+smtLSQPolicy=Partitioned
+smtLSQThreshold=100
+smtNumFetchingThreads=1
+smtROBPolicy=Partitioned
+smtROBThreshold=100
+squashWidth=8
+*/
+
+/*
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10000
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+repl=Null
+
+
+split=false
+split_size=0
+subblock_size=0
+tgts_per_mshr=20
+trace_addr=0
+two_queue=false
+
+cpu_side=system.cpu0.dcache_port
+mem_side=system.tol2bus.port[2]
+*/
+
+//[system.cpu0.dtb]
+//type=AlphaDT
+
+
+#endif /* ARCH_CONST_H_ */
diff --git a/ext/mcpat/array.cc b/ext/mcpat/array.cc
new file mode 100644 (file)
index 0000000..975f82f
--- /dev/null
@@ -0,0 +1,302 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#define  GLOBALVAR
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "area.h"
+#include "array.h"
+#include "decoder.h"
+#include "globalvar.h"
+#include "parameter.h"
+
+using namespace std;
+
+ArrayST::ArrayST(const InputParameter *configure_interface,
+                               string _name,
+                               enum Device_ty device_ty_,
+                               bool opt_local_,
+                               enum Core_type core_ty_,
+                               bool _is_default)
+:l_ip(*configure_interface),
+ name(_name),
+ device_ty(device_ty_),
+ opt_local(opt_local_),
+ core_ty(core_ty_),
+ is_default(_is_default)
+    {
+
+        if (l_ip.cache_sz<64) l_ip.cache_sz=64;
+        l_ip.error_checking();//not only do the error checking but also fill some missing parameters
+        optimize_array();
+
+}
+
+
+// Calls cacti_interface() on the current l_ip and stores what it returns in
+// local_result.
+void ArrayST::compute_base_power()
+{
+    // l_ip.out_w = l_ip.line_sz*8;   (kept disabled, as in the original)
+    local_result = cacti_interface(&l_ip);
+}
+
+void ArrayST::optimize_array()
+{
+        list<uca_org_t > candidate_solutions(0);
+        list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
+
+        uca_org_t * temp_res = 0;
+        local_result.valid=false;
+
+        double         throughput=l_ip.throughput, latency=l_ip.latency;
+        double  area_efficiency_threshold = 20.0;
+        bool   throughput_overflow=true, latency_overflow=true;
+        compute_base_power();
+
+        if ((local_result.cycle_time - throughput) <= 1e-10 )
+                throughput_overflow=false;
+        if ((local_result.access_time - latency)<= 1e-10)
+                latency_overflow=false;
+
+        if (opt_for_clk && opt_local)
+        {
+                if (throughput_overflow || latency_overflow)
+                {
+                        l_ip.ed=0;
+
+                        l_ip.delay_wt                = 100;//Fixed number, make sure timing can be satisfied.
+                        l_ip.cycle_time_wt           = 1000;
+
+                        l_ip.area_wt                 = 10;//Fixed number, This is used to exhaustive search for individual components.
+                        l_ip.dynamic_power_wt        = 10;//Fixed number, This is used to exhaustive search for individual components.
+                        l_ip.leakage_power_wt        = 10;
+
+                        l_ip.delay_dev               = 1000000;//Fixed number, make sure timing can be satisfied.
+                        l_ip.cycle_time_dev          = 100;
+
+                        l_ip.area_dev                = 1000000;//Fixed number, This is used to exhaustive search for individual components.
+                        l_ip.dynamic_power_dev       = 1000000;//Fixed number, This is used to exhaustive search for individual components.
+                        l_ip.leakage_power_dev       = 1000000;
+
+                        throughput_overflow=true; //Reset overflow flag before start optimization iterations
+                        latency_overflow=true;
+
+                        temp_res = &local_result; //Clean up the result for optimized for ED^2P
+                        temp_res->cleanup();
+                }
+
+
+                while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
+                {
+                        compute_base_power();
+
+                        l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration
+
+                        //             from best area to worst area -->worst timing to best timing
+                        if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
+                                        (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
+                        {  //if no satisfiable solution is found,the most aggressive one is left
+                                candidate_solutions.push_back(local_result);
+                                //output_data_csv(candidate_solutions.back());
+                                if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
+                                        //ensure stop opt not because of cam
+                                {
+                                        throughput_overflow=false;
+                                        latency_overflow=false;
+                                }
+
+                        }
+                        else
+                        {
+                                //TODO: whether checking the partial satisfied results too, or just change the mark???
+                                if ((local_result.cycle_time - throughput) <= 1e-10)
+                                                                                throughput_overflow=false;
+                                if ((local_result.access_time - latency)<= 1e-10)
+                                                                                latency_overflow=false;
+
+                                if (l_ip.cycle_time_dev > 10)
+                                {   //if not >10 local_result is the last result, it cannot be cleaned up
+                                        temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
+                                        temp_res->cleanup();
+                                }
+                        }
+//                     l_ip.cycle_time_dev-=10;
+//                     l_ip.delay_dev-=10;
+
+                }
+
+
+        if (l_ip.assoc > 0)
+        {
+                //For array structures except CAM and FA, Give warning but still provide a result with best timing found
+                if (throughput_overflow==true)
+                        cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
+                if (latency_overflow==true)
+                        cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;
+        }
+
+//     else
+//     {
+//             /*According to "Content-Addressable Memory (CAM) Circuits and
+//                             Architectures": A Tutorial and Survey
+//                             by Kostas Pagiamtzis et al.
+//                             CAM structures can be heavily pipelined and use look-ahead techniques,
+//                             therefore timing can be relaxed. But McPAT does not model the advanced
+//                             techniques. If continue optimizing, the area efficiency will be too low
+//             */
+//             //For CAM and FA, stop opt if area efficiency is too low
+//             if (throughput_overflow==true)
+//                     cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
+//                             <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
+//             if (latency_overflow==true)
+//                     cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
+//                             <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
+//     }
+
+                //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
+                double min_dynamic_energy=BIGNUM;
+                if (candidate_solutions.empty()==false)
+                {
+                        local_result.valid=true;
+                        for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)
+
+                        {
+                                if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
+                                {
+                                        min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
+                                        min_dynamic_energy_iter = candidate_iter;
+                                        local_result = *(min_dynamic_energy_iter);
+                                        //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.
+
+                                }
+                                else
+                                {
+                                        candidate_iter->cleanup() ;
+                                }
+
+                        }
+
+
+                }
+        candidate_solutions.clear();
+        }
+
+        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+
+        double macro_layout_overhead   = g_tp.macro_layout_overhead;
+        double chip_PR_overhead        = g_tp.chip_layout_overhead;
+        double total_overhead          = macro_layout_overhead*chip_PR_overhead;
+        local_result.area *= total_overhead;
+
+        //maintain constant power density
+        double pppm_t[4]    = {total_overhead,1,1,total_overhead};
+
+        double sckRation = g_tp.sckt_co_eff;
+        local_result.power.readOp.dynamic *= sckRation;
+        local_result.power.writeOp.dynamic *= sckRation;
+        local_result.power.searchOp.dynamic *= sckRation;
+        local_result.power.readOp.leakage *= l_ip.nbanks;
+        local_result.power.readOp.longer_channel_leakage =
+                local_result.power.readOp.leakage*long_channel_device_reduction;
+        local_result.power = local_result.power* pppm_t;
+
+        local_result.data_array2->power.readOp.dynamic *= sckRation;
+        local_result.data_array2->power.writeOp.dynamic *= sckRation;
+        local_result.data_array2->power.searchOp.dynamic *= sckRation;
+        local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
+        local_result.data_array2->power.readOp.longer_channel_leakage =
+                local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
+        local_result.data_array2->power = local_result.data_array2->power* pppm_t;
+
+
+        if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
+        {
+                local_result.tag_array2->power.readOp.dynamic *= sckRation;
+                local_result.tag_array2->power.writeOp.dynamic *= sckRation;
+                local_result.tag_array2->power.searchOp.dynamic *= sckRation;
+                local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
+                local_result.tag_array2->power.readOp.longer_channel_leakage =
+                        local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
+                local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
+        }
+
+
+}
+
+void ArrayST::leakage_feedback(double temperature)
+{
+  // Update the temperature. l_ip is already set and error-checked in the creator function.
+  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
+
+  // This corresponds to cacti_interface() in the initialization process. Leakage power is updated here.
+  reconfigure(&l_ip,&local_result);
+
+  // Scale the power values. This is part of ArrayST::optimize_array().
+  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+
+  double macro_layout_overhead   = g_tp.macro_layout_overhead;
+  double chip_PR_overhead        = g_tp.chip_layout_overhead;
+  double total_overhead          = macro_layout_overhead*chip_PR_overhead;
+
+  double pppm_t[4]    = {total_overhead,1,1,total_overhead};
+
+  double sckRation = g_tp.sckt_co_eff;
+  local_result.power.readOp.dynamic *= sckRation;
+  local_result.power.writeOp.dynamic *= sckRation;
+  local_result.power.searchOp.dynamic *= sckRation;
+  local_result.power.readOp.leakage *= l_ip.nbanks;
+  local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction;
+  local_result.power = local_result.power* pppm_t;
+
+  local_result.data_array2->power.readOp.dynamic *= sckRation;
+  local_result.data_array2->power.writeOp.dynamic *= sckRation;
+  local_result.data_array2->power.searchOp.dynamic *= sckRation;
+  local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
+  local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
+  local_result.data_array2->power = local_result.data_array2->power* pppm_t;
+
+  if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
+  {
+    local_result.tag_array2->power.readOp.dynamic *= sckRation;
+    local_result.tag_array2->power.writeOp.dynamic *= sckRation;
+    local_result.tag_array2->power.searchOp.dynamic *= sckRation;
+    local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
+    local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
+    local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
+  }
+}
+
+// Destructor: hands the stored CACTI result back through
+// local_result.cleanup(). NOTE(review): cleanup() is defined outside this
+// file; presumably it frees the data/tag array records -- confirm there.
+ArrayST:: ~ArrayST()
+{
+        local_result.cleanup();
+}
diff --git a/ext/mcpat/array.h b/ext/mcpat/array.h
new file mode 100644 (file)
index 0000000..8c6124d
--- /dev/null
@@ -0,0 +1,101 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef ARRAY_H_
+#define ARRAY_H_
+
+#include <iostream>
+#include <string>
+
+#include "basic_components.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "const.h"
+#include "parameter.h"
+
+using namespace std;
+
+class ArrayST :public Component{
+ public:
+  ArrayST(){};
+  ArrayST(const InputParameter *configure_interface, string _name, enum Device_ty device_ty_, bool opt_local_=true, enum Core_type core_ty_=Inorder,  bool _is_default=true);
+
+  InputParameter l_ip;
+  string         name;
+  enum Device_ty device_ty;
+  bool opt_local;
+  enum Core_type core_ty;
+  bool           is_default;
+  uca_org_t      local_result;
+
+  statsDef       tdp_stats;
+  statsDef       rtp_stats;
+  statsDef       stats_t;
+  powerDef       power_t;
+
+  virtual void optimize_array();
+  virtual void compute_base_power();
+  virtual ~ArrayST();
+
+  void leakage_feedback(double temperature);
+};
+
+class InstCache :public Component{
+public:
+  ArrayST* caches;
+  ArrayST* missb;
+  ArrayST* ifb;
+  ArrayST* prefetchb;
+  powerDef power_t;//temp value holder for both (max) power and runtime power
+  InstCache(){caches=0;missb=0;ifb=0;prefetchb=0;};
+  ~InstCache(){
+          if (caches)    {//caches->local_result.cleanup();
+                                          delete caches; caches=0;}
+          if (missb)     {//missb->local_result.cleanup();
+                                          delete missb; missb=0;}
+          if (ifb)       {//ifb->local_result.cleanup();
+                                          delete ifb; ifb=0;}
+          if (prefetchb) {//prefetchb->local_result.cleanup();
+                                          delete prefetchb; prefetchb=0;}
+   };
+};
+
+/*
+ * InstCache extended with one additional array, wbb, which this class
+ * deletes on destruction.
+ */
+class DataCache : public InstCache
+{
+  public:
+    ArrayST* wbb;
+
+    DataCache() : wbb(0) {}
+
+    ~DataCache()
+    {
+        // Deleting a null pointer is a no-op, so no guard is required.
+        delete wbb;
+        wbb = 0;
+    }
+};
+
+#endif /* ARRAY_H_ */
diff --git a/ext/mcpat/basic_components.cc b/ext/mcpat/basic_components.cc
new file mode 100644 (file)
index 0000000..f288d74
--- /dev/null
@@ -0,0 +1,127 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "basic_components.h"
+
+double longer_channel_device_reduction(
+                enum Device_ty device_ty,
+                enum Core_type core_ty)
+{
+
+        double longer_channel_device_percentage_core;
+        double longer_channel_device_percentage_uncore;
+        double longer_channel_device_percentage_llc;
+
+        double long_channel_device_reduction;
+
+        longer_channel_device_percentage_llc    = 1.0;
+        longer_channel_device_percentage_uncore = 0.82;
+        if (core_ty==OOO)
+        {
+                longer_channel_device_percentage_core   = 0.56;//0.54 Xeon Tulsa //0.58 Nehelam
+                //longer_channel_device_percentage_uncore = 0.76;//0.85 Nehelam
+
+        }
+        else
+        {
+                longer_channel_device_percentage_core   = 0.8;//0.8;//Niagara
+                //longer_channel_device_percentage_uncore = 0.9;//Niagara
+        }
+
+        if (device_ty==Core_device)
+        {
+                long_channel_device_reduction = (1- longer_channel_device_percentage_core)
+                + longer_channel_device_percentage_core * g_tp.peri_global.long_channel_leakage_reduction;
+        }
+        else if (device_ty==Uncore_device)
+        {
+                long_channel_device_reduction = (1- longer_channel_device_percentage_uncore)
+                + longer_channel_device_percentage_uncore * g_tp.peri_global.long_channel_leakage_reduction;
+        }
+        else if (device_ty==LLC_device)
+        {
+                long_channel_device_reduction = (1- longer_channel_device_percentage_llc)
+                + longer_channel_device_percentage_llc * g_tp.peri_global.long_channel_leakage_reduction;
+        }
+        else
+        {
+                cout<<"unknown device category"<<endl;
+                exit(0);
+        }
+
+        return long_channel_device_reduction;
+}
+
+// Member-wise sum of two statsComponents records.
+statsComponents operator+(const statsComponents & x, const statsComponents & y)
+{
+        statsComponents sum(x);
+
+        sum.access += y.access;
+        sum.hit    += y.hit;
+        sum.miss   += y.miss;
+
+        return sum;
+}
+
+// Scales a statsComponents record by a coefficient array: access is
+// multiplied by y[0], hit by y[1] and miss by y[2], so y must point to at
+// least three doubles.
+statsComponents operator*(const statsComponents & x, double const * const y)
+{
+        statsComponents scaled(x);
+
+        scaled.access *= y[0];
+        scaled.hit    *= y[1];
+        scaled.miss   *= y[2];
+
+        return scaled;
+}
+
+// Sums two statsDef records, combining the read, write and search
+// statistics pairwise with the statsComponents operator+.
+statsDef operator+(const statsDef & x, const statsDef & y)
+{
+        statsDef total(x);
+
+        total.readAc   = total.readAc   + y.readAc;
+        total.writeAc  = total.writeAc  + y.writeAc;
+        total.searchAc = total.searchAc + y.searchAc;
+
+        return total;
+}
+
+// Scales a statsDef record: the same coefficient array y is forwarded to
+// the statsComponents operator* for the read, write and search statistics.
+statsDef operator*(const statsDef & x, double const * const y)
+{
+        statsDef scaled(x);
+
+        scaled.readAc   = scaled.readAc   * y;
+        scaled.writeAc  = scaled.writeAc  * y;
+        scaled.searchAc = scaled.searchAc * y;
+
+        return scaled;
+}
diff --git a/ext/mcpat/basic_components.h b/ext/mcpat/basic_components.h
new file mode 100644 (file)
index 0000000..ce3e639
--- /dev/null
@@ -0,0 +1,265 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef BASIC_COMPONENTS_H_
+#define BASIC_COMPONENTS_H_
+
+#include <vector>
+
+#include "XML_Parse.h"
+#include "parameter.h"
+
+const double cdb_overhead = 1.1;
+
+enum FU_type {
+    FPU,
+    ALU,
+    MUL
+};
+
+enum Core_type {
+        OOO,
+        Inorder
+};
+
+enum Renaming_type {
+    RAMbased,
+        CAMbased
+};
+
+enum Scheduler_type {
+    PhysicalRegFile,
+        ReservationStation
+};
+
+enum cache_level {
+    L2,
+    L3,
+    L1Directory,
+    L2Directory
+};
+
+enum MemoryCtrl_type {
+        MC,    //memory controller
+        FLASHC //flash controller
+};
+
+enum Dir_type {
+        ST,//shadowed tag
+        DC,//directory cache
+        SBT,//static bank tag
+        NonDir
+
+};
+
+enum Cache_policy {
+        Write_through,
+        Write_back
+};
+
+enum Device_ty {
+        Core_device,
+        Uncore_device,
+        LLC_device
+};
+
+class statsComponents
+{
+  public:
+    double access;
+    double hit;
+    double miss;
+
+    statsComponents() : access(0), hit(0), miss(0)  {}
+    statsComponents(const statsComponents & obj) { *this = obj; }
+    statsComponents & operator=(const statsComponents & rhs)
+    {
+      access = rhs.access;
+      hit = rhs.hit;
+      miss  = rhs.miss;
+      return *this;
+    }
+    void reset() { access = 0; hit = 0; miss = 0;}
+
+    friend statsComponents operator+(const statsComponents & x, const statsComponents & y);
+    friend statsComponents operator*(const statsComponents & x, double const * const y);
+};
+
+// Groups the statsComponents counters for read, write and search accesses.
+class statsDef
+{
+  public:
+    statsComponents readAc;
+    statsComponents writeAc;
+    statsComponents searchAc;
+
+    // Each member's own default constructor zeroes its counters.
+    statsDef() {}
+
+    // Zeroes the read, write and search counters.
+    void reset()
+    {
+      readAc.reset();
+      writeAc.reset();
+      searchAc.reset();
+    }
+
+    friend statsDef operator+(const statsDef & x, const statsDef & y);
+    friend statsDef operator*(const statsDef & x, double const * const y);
+};
+
+// Returns the leakage scaling factor for the given device category and core
+// type; the definition lives in basic_components.cc. When called with no
+// arguments it evaluates the Core_device / Inorder case.
+double longer_channel_device_reduction(
+                enum Device_ty device_ty=Core_device,
+                enum Core_type core_ty=Inorder);
+
+class CoreDynParam {
+public:
+        CoreDynParam(){};
+        CoreDynParam(ParseXML *XML_interface, int ithCore_);
+        //    :XML(XML_interface),
+        //     ithCore(ithCore_)
+        //     core_ty(inorder),
+        //     rm_ty(CAMbased),
+        //     scheu_ty(PhysicalRegFile),
+        //     clockRate(1e9),//1GHz
+        //     arch_ireg_width(32),
+        //     arch_freg_width(32),
+        //     phy_ireg_width(128),
+        //     phy_freg_width(128),
+        //     perThreadState(8),
+        //     globalCheckpoint(32),
+        //     instructionLength(32){};
+        //ParseXML * XML;
+        bool opt_local;
+        bool x86;
+        bool Embedded;
+    enum Core_type  core_ty;
+        enum Renaming_type rm_ty;
+    enum Scheduler_type scheu_ty;
+    double clockRate,executionTime;
+    int  arch_ireg_width, arch_freg_width, phy_ireg_width, phy_freg_width;
+    int  num_IRF_entry, num_FRF_entry, num_ifreelist_entries, num_ffreelist_entries;
+    int  fetchW, decodeW,issueW,peak_issueW, commitW,peak_commitW, predictionW, fp_issueW, fp_decodeW;
+    int  perThreadState, globalCheckpoint, instruction_length, pc_width, opcode_length, micro_opcode_length;
+    int  num_hthreads, pipeline_stages, fp_pipeline_stages, num_pipelines, num_fp_pipelines;
+    int  num_alus, num_muls;
+    double num_fpus;
+    int  int_data_width, fp_data_width,v_address_width, p_address_width;
+    double pipeline_duty_cycle, total_cycles, busy_cycles, idle_cycles;
+    bool regWindowing,multithreaded;
+    double pppm_lkg_multhread[4];
+        double IFU_duty_cycle,BR_duty_cycle,LSU_duty_cycle,MemManU_I_duty_cycle,
+               MemManU_D_duty_cycle, ALU_duty_cycle,MUL_duty_cycle,
+               FPU_duty_cycle, ALU_cdb_duty_cycle,MUL_cdb_duty_cycle,
+               FPU_cdb_duty_cycle;
+    ~CoreDynParam(){};
+};
+
+class CacheDynParam {
+public:
+        CacheDynParam(){};
+        CacheDynParam(ParseXML *XML_interface, int ithCache_);
+    string name;
+        enum Dir_type    dir_ty;
+        double clockRate,executionTime;
+    double    capacity, blockW, assoc, nbanks;
+    double throughput, latency;
+    double duty_cycle, dir_duty_cycle;
+    //double duty_cycle;
+    int missb_size, fu_size, prefetchb_size, wbb_size;
+    ~CacheDynParam(){};
+};
+
+class MCParam {
+public:
+        MCParam(){};
+        MCParam(ParseXML *XML_interface, int ithCache_);
+    string name;
+    double  clockRate,num_mcs, peakDataTransferRate, num_channels;
+    //  double mcTEPowerperGhz;
+    // double mcPHYperGbit;
+    // double area;
+    int           llcBlockSize, dataBusWidth, addressBusWidth;
+    int    opcodeW;
+    int    memAccesses;
+    int    memRank;
+    int    type;
+    double frontend_duty_cycle, duty_cycle, perc_load;
+    double executionTime, reads, writes;
+    bool   LVDS, withPHY;
+
+    ~MCParam(){};
+};
+
+class NoCParam {
+public:
+        NoCParam(){};
+        NoCParam(ParseXML *XML_interface, int ithCache_);
+    string name;
+    double  clockRate;
+    int           flit_size;
+    int    input_ports, output_ports, min_ports, global_linked_ports;
+    int    virtual_channel_per_port,input_buffer_entries_per_vc;
+    int    horizontal_nodes,vertical_nodes, total_nodes;
+    double executionTime, total_access, link_throughput,link_latency,
+                   duty_cycle, chip_coverage, route_over_perc;
+    bool   has_global_link, type;
+
+    ~NoCParam(){};
+};
+
+// Plain processor-level parameter record: per-component counts and the
+// matching homo* flags. The default constructor leaves the numeric fields
+// uninitialized; the two-argument constructor is defined outside this header.
+class ProcParam
+{
+  public:
+    ProcParam() {}
+    ProcParam(ParseXML *XML_interface, int ithCache_);
+
+    string name;
+    int  numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir, numMC,
+         numMCChannel;
+    bool homoCore, homoL2, homoL3, homoNOC, homoL1Dir, homoL2Dir;
+
+    ~ProcParam() {}
+};
+
+// Plain parameter record for the NIU component. The default constructor
+// leaves the numeric fields uninitialized; the two-argument constructor is
+// defined outside this header.
+class NIUParam
+{
+  public:
+    NIUParam() {}
+    NIUParam(ParseXML *XML_interface, int ithCache_);
+
+    string name;
+    double clockRate;
+    int    num_units;
+    int    type;
+    double duty_cycle, perc_load;
+
+    ~NIUParam() {}
+};
+
+// Plain parameter record for the PCIe component. The default constructor
+// leaves the numeric fields uninitialized; the two-argument constructor is
+// defined outside this header.
+class PCIeParam
+{
+  public:
+    PCIeParam() {}
+    PCIeParam(ParseXML *XML_interface, int ithCache_);
+
+    string name;
+    double clockRate;
+    int    num_channels, num_units;
+    bool   withPHY;
+    int    type;
+    double duty_cycle, perc_load;
+
+    ~PCIeParam() {}
+};
+#endif /* BASIC_COMPONENTS_H_ */
diff --git a/ext/mcpat/cacti/README b/ext/mcpat/cacti/README
new file mode 100644 (file)
index 0000000..de429d2
--- /dev/null
@@ -0,0 +1,94 @@
+-----------------------------------------------------------
+          ____    _    ____ _____ ___    __    ____  
+         / ___|  / \  / ___|_   _|_ _|  / /_  | ___| 
+        | |     / _ \| |     | |  | |  | '_ \ |___ \ 
+        | |___ / ___ \ |___  | |  | |  | (_) | ___) |
+         \____/_/   \_\____| |_| |___|  \___(_)____/ 
+
+
+             A Tool to Model Caches/Memories
+-----------------------------------------------------------
+
+CACTI is an analytical tool that takes a set of cache/memory para-
+meters as input and calculates its access time, power, cycle 
+time, and area.
+CACTI was originally developed by Dr. Jouppi and Dr. Wilton
+in 1993 and since then it has undergone five major 
+revisions.
+
+List of features (version 1-6.5):
+===============================
+The following is the list of features supported by the tool. 
+
+* Power, delay, area, and cycle time model for 
+                  direct mapped caches
+                  set-associative caches
+                  fully associative caches
+                  Embedded DRAM memories
+                  Commodity DRAM memories
+                  
+* Support for modeling multi-ported uniform cache access (UCA)
+  and multi-banked, multi-ported non-uniform cache access (NUCA).
+
+* Leakage power calculation that also considers the operating
+  temperature of the cache.
+  
+* Router power model.
+
+* Interconnect model with different delay, power, and area 
+  properties including low-swing wire model.
+
+* An interface to perform trade-off analysis involving power, delay,
+  area, and bandwidth.
+
+* All process specific values used by the tool are obtained
+  from ITRS and currently, the tool supports 90nm, 65nm, 45nm, 
+  and 32nm technology nodes.
+
+Version 6.5 has a new C++ code base and includes numerous bug fixes.
+CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single
+block of data. This technique improves reliability at the cost of  
+power. CACTI 6.5 activates only the minimum number of mats needed to retrieve
+a block, which minimizes power.
+
+How to use the tool?
+====================
+Prior versions of CACTI take input parameters such as cache
+size and technology node as a set of command line arguments. 
+To avoid a long list of command line arguments, 
+CACTI 6.5 lets users specify their cache model in a more 
+detailed manner by using a config file (cache.cfg).
+
+-> define the cache model using cache.cfg
+-> run the "cacti" binary <./cacti -infile cache.cfg>
+
+CACTI6.5 also provides a command line interface similar to earlier versions
+of CACTI. The command line interface can be used as
+
+./cacti  cache_size line_size associativity rw_ports excl_read_ports excl_write_ports 
+  single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width
+  access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power
+  obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power
+  dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in
+  data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in
+  interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in
+  REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in 
+  BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in
+  INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm 
+  ndsam1 ndsam2 ecc
+
+For complete documentation of the tool, please refer to the CACTI-5.3 and 6.0
+technical reports and the following paper,
+"Optimizing NUCA Organizations and Wiring Alternatives for 
+Large Caches With CACTI 6.0", that appears in MICRO 2007.
+
+We are still improving the tool and refining the code. If you
+have any comments, questions, or suggestions please write to
+us.
+
+Naveen Muralimanohar             Jung Ho Ahn        Sheng Li
+naveen.muralimanohar@hp.com      gajh@snu.ac.kr     sheng.li@hp.com
+
+
+
+
diff --git a/ext/mcpat/cacti/Ucache.cc b/ext/mcpat/cacti/Ucache.cc
new file mode 100644 (file)
index 0000000..f3e1227
--- /dev/null
@@ -0,0 +1,916 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#include <pthread.h>
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <iostream>
+#include <list>
+
+#include "Ucache.h"
+#include "area.h"
+#include "bank.h"
+#include "basic_circuit.h"
+#include "component.h"
+#include "const.h"
+#include "decoder.h"
+#include "parameter.h"
+#include "subarray.h"
+#include "uca.h"
+
+using namespace std;
+
+const uint32_t nthreads = NTHREADS;
+
+
+// Merge another set of running minima into this one: each metric keeps
+// the smaller of its current value and the corresponding value in *val.
+void min_values_t::update_min_values(const min_values_t * val)
+{
+  if (min_delay > val->min_delay) {
+    min_delay = val->min_delay;
+  }
+  if (min_dyn > val->min_dyn) {
+    min_dyn = val->min_dyn;
+  }
+  if (min_leakage > val->min_leakage) {
+    min_leakage = val->min_leakage;
+  }
+  if (min_area > val->min_area) {
+    min_area = val->min_area;
+  }
+  if (min_cyc > val->min_cyc) {
+    min_cyc = val->min_cyc;
+  }
+}
+
+
+
+// Lower the running minima using the metrics of one UCA organization:
+// access time, read dynamic power, read leakage, area and cycle time.
+void min_values_t::update_min_values(const uca_org_t & res)
+{
+  if (min_delay > res.access_time) {
+    min_delay = res.access_time;
+  }
+  if (min_dyn > res.power.readOp.dynamic) {
+    min_dyn = res.power.readOp.dynamic;
+  }
+  if (min_leakage > res.power.readOp.leakage) {
+    min_leakage = res.power.readOp.leakage;
+  }
+  if (min_area > res.area) {
+    min_area = res.area;
+  }
+  if (min_cyc > res.cycle_time) {
+    min_cyc = res.cycle_time;
+  }
+}
+
+// Lower the running minima using the metrics stored in the nuca_pda
+// member of a NUCA organization.
+void min_values_t::update_min_values(const nuca_org_t * res)
+{
+  if (min_delay > res->nuca_pda.delay) {
+    min_delay = res->nuca_pda.delay;
+  }
+  if (min_dyn > res->nuca_pda.power.readOp.dynamic) {
+    min_dyn = res->nuca_pda.power.readOp.dynamic;
+  }
+  if (min_leakage > res->nuca_pda.power.readOp.leakage) {
+    min_leakage = res->nuca_pda.power.readOp.leakage;
+  }
+  if (min_area > res->nuca_pda.area.get_area()) {
+    min_area = res->nuca_pda.area.get_area();
+  }
+  if (min_cyc > res->nuca_pda.cycle_time) {
+    min_cyc = res->nuca_pda.cycle_time;
+  }
+}
+
+// Lower the running minima using the metrics of a single memory array
+// (tag or data) result.
+void min_values_t::update_min_values(const mem_array * res)
+{
+  if (min_delay > res->access_time) {
+    min_delay = res->access_time;
+  }
+  if (min_dyn > res->power.readOp.dynamic) {
+    min_dyn = res->power.readOp.dynamic;
+  }
+  if (min_leakage > res->power.readOp.leakage) {
+    min_leakage = res->power.readOp.leakage;
+  }
+  if (min_area > res->area) {
+    min_area = res->area;
+  }
+  if (min_cyc > res->cycle_time) {
+    min_cyc = res->cycle_time;
+  }
+}
+
+
+
+/*
+ * Thread entry point used by solve(): evaluates this thread's share of the
+ * array partitioning search space with calculate_time().
+ *
+ * void_obj points to a calc_time_mt_wrapper_struct. Every valid partition
+ * found is appended to the struct's tag_arr or data_arr list (depending on
+ * is_tag / fully_assoc) and folded into tag_res / data_res. The function
+ * does not return normally; it ends the thread with pthread_exit(NULL).
+ */
+void * calc_time_mt_wrapper(void * void_obj)
+{
+  calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj;
+  uint32_t tid                   = calc_obj->tid;
+  list<mem_array *> & data_arr   = calc_obj->data_arr;
+  list<mem_array *> & tag_arr    = calc_obj->tag_arr;
+  bool is_tag                    = calc_obj->is_tag;
+  bool pure_ram                  = calc_obj->pure_ram;
+  bool pure_cam                                         = calc_obj->pure_cam;
+  bool is_main_mem               = calc_obj->is_main_mem;
+  double Nspd_min                = calc_obj->Nspd_min;
+  min_values_t * data_res        = calc_obj->data_res;
+  min_values_t * tag_res         = calc_obj->tag_res;
+
+  // Each list always ends with one scratch mem_array that the next
+  // calculate_time() call fills in; it only becomes a real result when the
+  // partition turns out to be valid (a new scratch entry is pushed then).
+  // Note that clear() drops the stored pointers without deleting them.
+  data_arr.clear();
+  data_arr.push_back(new mem_array);
+  tag_arr.clear();
+  tag_arr.push_back(new mem_array);
+
+  // The (Ndwl, Ndbl, Ndcm) space is flattened into a single index so that
+  // it can be striped across threads below.
+  uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
+  uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
+  uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
+  uint32_t niter      = Ndwl_niter * Ndbl_niter * Ndcm_niter;
+
+
+  bool is_valid_partition;
+  int wt_min, wt_max;
+
+  // Select the range of wire types to sweep.
+  if (g_ip->force_wiretype) {
+    if (g_ip->wt == 0) {
+      wt_min = Low_swing;
+      wt_max = Low_swing;
+    }
+    else {
+      wt_min = Global;
+      wt_max = Low_swing-1;
+    }
+  }
+  else {
+    wt_min = Global;
+    wt_max = Low_swing;
+  }
+
+  for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2)
+  {
+    for (int wr = wt_min; wr <= wt_max; wr++)
+    {
+      // Thread tid handles flattened indices tid, tid + nthreads, ...
+      for (uint32_t iter = tid; iter < niter; iter += nthreads)
+      {
+        // reconstruct Ndwl, Ndbl, Ndcm
+        unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
+        unsigned int Ndbl = 1 << ((iter / (Ndcm_niter))%Ndbl_niter);
+        unsigned int Ndcm = 1 << (iter % Ndcm_niter);
+        for(unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX; Ndsam_lev_1 *= 2)
+        {
+          for(unsigned int Ndsam_lev_2 = 1; Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2)
+          {
+            //for debuging
+            // With a forced configuration (data array only) the loop
+            // variables are overwritten with the user-supplied values.
+            if (g_ip->force_cache_config && is_tag == false)
+            {
+              wr   = g_ip->wt;
+              Ndwl = g_ip->ndwl;
+              Ndbl = g_ip->ndbl;
+              Ndcm = g_ip->ndcm;
+              if(g_ip->nspd != 0) {
+                  Nspd = g_ip->nspd;
+              }
+              if(g_ip->ndsam1 != 0) {
+                  Ndsam_lev_1 = g_ip->ndsam1;
+                  Ndsam_lev_2 = g_ip->ndsam2;
+              }
+            }
+
+            if (is_tag == true)
+            {
+              is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl,
+                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
+                  tag_arr.back(), 0, NULL, NULL,
+                  is_main_mem);
+            }
+            // If it's a fully-associative cache, the data array partition parameters are identical to that of
+            // the tag array, so compute data array partition properties also here.
+            if (is_tag == false || g_ip->fully_assoc)
+            {
+              is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl,
+                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
+                  data_arr.back(), 0, NULL, NULL,
+                  is_main_mem);
+            }
+
+            // Keep the filled-in scratch entry as a result, record the wire
+            // type, update the running minima and append a new scratch entry.
+            if (is_valid_partition)
+            {
+              if (is_tag == true)
+              {
+                tag_arr.back()->wt = (enum Wire_type) wr;
+                tag_res->update_min_values(tag_arr.back());
+                tag_arr.push_back(new mem_array);
+              }
+              if (is_tag == false || g_ip->fully_assoc)
+              {
+                data_arr.back()->wt = (enum Wire_type) wr;
+                data_res->update_min_values(data_arr.back());
+                data_arr.push_back(new mem_array);
+              }
+            }
+
+            // Forced configuration: push the loop variables to their
+            // terminal values so the enclosing loops stop after this one
+            // evaluation (Nspd and Ndsam only if they were forced above).
+            if (g_ip->force_cache_config && is_tag == false)
+            {
+                wr   = wt_max;
+                iter = niter;
+                if(g_ip->nspd != 0) {
+                        Nspd = MAXDATASPD;
+                }
+                if (g_ip->ndsam1 != 0) {
+                        Ndsam_lev_1 = MAX_COL_MUX+1;
+                        Ndsam_lev_2 = MAX_COL_MUX+1;
+                }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Drop the unused trailing scratch entry of each list.
+  delete data_arr.back();
+  delete tag_arr.back();
+  data_arr.pop_back();
+  tag_arr.pop_back();
+
+  pthread_exit(NULL);
+}
+
+
+
+/*
+ * Builds a UCA model for one partitioning choice and, when
+ * flag_results_populate is zero, copies its timing, area and power figures
+ * into *ptr_array.
+ *
+ * Returns false (leaving *ptr_array untouched) when DynamicParameter marks
+ * the combination as invalid, true otherwise.
+ *
+ * NOTE(review): ptr_results and ptr_fin_res are not referenced in this
+ * body, and the flag_results_populate != 0 branch is empty.
+ * NOTE(review): Ndcm is not stored in *ptr_array here (only deg_bl_muxing
+ * is), although update() in this file reads mem_array::Ndcm - confirm that
+ * it is set elsewhere.
+ */
+bool calculate_time(
+    bool is_tag,
+    int pure_ram,
+    bool pure_cam,
+    double Nspd,
+    unsigned int Ndwl,
+    unsigned int Ndbl,
+    unsigned int Ndcm,
+    unsigned int Ndsam_lev_1,
+    unsigned int Ndsam_lev_2,
+    mem_array *ptr_array,
+    int flag_results_populate,
+    results_mem_array *ptr_results,
+    uca_org_t *ptr_fin_res,
+    bool is_main_mem)
+{
+  DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem);
+
+  if (dyn_p.is_valid == false)
+  {
+    return false;
+  }
+
+  // Temporary model; every value needed is copied out before it is deleted
+  // at the end of the function.
+  UCA * uca = new UCA(dyn_p);
+
+
+  if (flag_results_populate)
+  { //For the final solution, populate the ptr_results data structure  -- TODO: copy only necessary variables
+  }
+  else
+  {
+          int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
+          int num_mats = uca->bank.dp.num_mats;
+          bool is_fa = uca->bank.dp.fully_assoc;
+          // This local shadows the pure_cam parameter within this branch.
+          bool pure_cam = uca->bank.dp.pure_cam;
+        // Partitioning parameters of this organization.
+        ptr_array->Ndwl = Ndwl;
+    ptr_array->Ndbl = Ndbl;
+    ptr_array->Nspd = Nspd;
+    ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
+    ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
+    ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
+    // Overall timing, area and power.
+    ptr_array->access_time = uca->access_time;
+    ptr_array->cycle_time = uca->cycle_time;
+    ptr_array->multisubbank_interleave_cycle_time = uca->multisubbank_interleave_cycle_time;
+    ptr_array->area_ram_cells = uca->area_all_dataramcells;
+    ptr_array->area   = uca->area.get_area();
+    ptr_array->height = uca->area.h;
+    ptr_array->width  = uca->area.w;
+    ptr_array->mat_height = uca->bank.mat.area.h;
+    ptr_array->mat_length = uca->bank.mat.area.w;
+    ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
+    ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
+    ptr_array->power  = uca->power;
+    ptr_array->delay_senseamp_mux_decoder =
+      MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
+          uca->delay_array_to_sa_mux_lev_2_decoder);
+    ptr_array->delay_before_subarray_output_driver         = uca->delay_before_subarray_output_driver;
+    ptr_array->delay_from_subarray_output_driver_to_output = uca->delay_from_subarray_out_drv_to_out;
+
+    // Per-component delays.
+    ptr_array->delay_route_to_bank          = uca->htree_in_add->delay;
+    ptr_array->delay_input_htree            = uca->bank.htree_in_add->delay;
+    ptr_array->delay_row_predecode_driver_and_block = uca->bank.mat.r_predec->delay;
+    ptr_array->delay_row_decoder            = uca->bank.mat.row_dec->delay;
+    ptr_array->delay_bitlines               = uca->bank.mat.delay_bitline;
+    ptr_array->delay_matchlines               = uca->bank.mat.delay_matchchline;
+    ptr_array->delay_sense_amp              = uca->bank.mat.delay_sa;
+    ptr_array->delay_subarray_output_driver = uca->bank.mat.delay_subarray_out_drv_htree;
+    ptr_array->delay_dout_htree             = uca->bank.htree_out_data->delay;
+    ptr_array->delay_comparator             = uca->bank.mat.delay_comparator;
+
+    ptr_array->all_banks_height = uca->area.h;
+    ptr_array->all_banks_width  = uca->area.w;
+    // Percentage of the total area taken by the data RAM cells.
+    ptr_array->area_efficiency = uca->area_all_dataramcells * 100 / (uca->area.get_area());
+
+    // Per-component power. For the mat-level components below, the dynamic
+    // read/write/search figures are multiplied by num_act_mats_hor_dir;
+    // the other members of each copied power value are left as copied.
+    ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
+    ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
+    ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
+//    cout<<"power_data_input_htree"<<uca->bank.htree_in_data->power.readOp.leakage<<endl;
+    ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;
+//    cout<<"power_data_output_htree"<<uca->bank.htree_out_data->power.readOp.leakage<<endl;
+    ptr_array->power_row_predecoder_drivers = uca->bank.mat.r_predec->driver_power;
+    ptr_array->power_row_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_row_predecoder_blocks = uca->bank.mat.r_predec->block_power;
+    ptr_array->power_row_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
+    ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_predecoder_drivers = uca->bank.mat.b_mux_predec->driver_power;
+    ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_predecoder_blocks  = uca->bank.mat.b_mux_predec->block_power;
+    ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders;
+    ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers = uca->bank.mat.sa_mux_lev_1_predec->driver_power;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks = uca->bank.mat.sa_mux_lev_1_predec->block_power;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_decoders = uca->bank.mat.power_sa_mux_lev_1_decoders;
+    ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers = uca->bank.mat.sa_mux_lev_2_predec->driver_power;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks = uca->bank.mat.sa_mux_lev_2_predec->block_power;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_decoders = uca->bank.mat.power_sa_mux_lev_2_decoders;
+    ptr_array->power_senseamp_mux_lev_2_decoders .readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_decoders .writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_decoders .searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bitlines = uca->bank.mat.power_bitline;
+    ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_sense_amps = uca->bank.mat.power_sa;
+    ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_prechg_eq_drivers = uca->bank.mat.power_bl_precharge_eq_drv;
+    ptr_array->power_prechg_eq_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_prechg_eq_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_prechg_eq_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_output_drivers_at_subarray = uca->bank.mat.power_subarray_out_drv;
+    ptr_array->power_output_drivers_at_subarray.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_comparators = uca->bank.mat.power_comparator;
+    ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;
+
+//    cout <<  "  num of mats: " << dyn_p.num_mats << endl;
+    // Search-path components are copied only for fully-associative or
+    // pure-CAM arrays; their search dynamic figures are multiplied by
+    // num_mats rather than num_act_mats_hor_dir.
+    if (is_fa || pure_cam)
+    {
+    ptr_array->power_htree_in_search = uca->bank.htree_in_search->power;
+//    cout<<"power_htree_in_search"<<uca->bank.htree_in_search->power.readOp.leakage<<endl;
+    ptr_array->power_htree_out_search = uca->bank.htree_out_search->power;
+//    cout<<"power_htree_out_search"<<uca->bank.htree_out_search->power.readOp.leakage<<endl;
+    ptr_array->power_searchline = uca->bank.mat.power_searchline;
+//    cout<<"power_searchlineh"<<uca->bank.mat.power_searchline.readOp.leakage<<endl;
+    ptr_array->power_searchline.searchOp.dynamic *= num_mats;
+    ptr_array->power_searchline_precharge = uca->bank.mat.power_searchline_precharge;
+    ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchlines = uca->bank.mat.power_matchline;
+    ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchline_precharge = uca->bank.mat.power_matchline_precharge;
+    ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchline_to_wordline_drv = uca->bank.mat.power_ml_to_ram_wl_drv;
+//    cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.power_matchline.searchOp.leakage<<endl;
+    }
+
+    // Energy, refresh and leakage figures copied straight from the model.
+    ptr_array->activate_energy = uca->activate_energy;
+    ptr_array->read_energy = uca->read_energy;
+    ptr_array->write_energy = uca->write_energy;
+    ptr_array->precharge_energy = uca->precharge_energy;
+    ptr_array->refresh_power = uca->refresh_power;
+    ptr_array->leak_power_subbank_closed_page = uca->leak_power_subbank_closed_page;
+    ptr_array->leak_power_subbank_open_page = uca->leak_power_subbank_open_page;
+    ptr_array->leak_power_request_and_reply_networks = uca->leak_power_request_and_reply_networks;
+
+    ptr_array->precharge_delay = uca->precharge_delay;
+
+
+//      cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.<<endl;
+//
+//    if (!(is_fa || pure_cam))
+//    {
+//     cout <<  "  num of cols: " << dyn_p.num_c_subarray << endl;
+//    }
+//    else if (is_fa)
+//    {
+//       cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray+ dyn_p.data_num_c_subarray<< endl;
+//    } else
+//       cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray<< endl;
+//      cout <<  uca->bank.mat.subarray.get_total_cell_area()<<endl;
+  }
+
+
+  delete uca;
+  return true;
+}
+
+
+
+/*
+ * Returns true when the organization u stays within every user-specified
+ * percentage deviation (delay, dynamic power, leakage power, cycle time,
+ * area) from the corresponding minimum in *minval, false as soon as one
+ * of them is exceeded.
+ */
+bool check_uca_org(uca_org_t & u, min_values_t *minval)
+{
+  // The checks are evaluated left to right and stop at the first violation.
+  const bool exceeds_limit =
+    (((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev) ||
+    (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+      g_ip->dynamic_power_dev) ||
+    (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+      g_ip->leakage_power_dev) ||
+    (((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+      g_ip->cycle_time_dev) ||
+    (((u.area - minval->min_area)/minval->min_area)*100 >
+      g_ip->area_dev);
+
+  return !exceeds_limit;
+}
+
+/*
+ * Returns true when the memory array u stays within every user-specified
+ * percentage deviation (delay, dynamic power, leakage power, cycle time,
+ * area) from the corresponding minimum in *minval, false as soon as one
+ * of them is exceeded.
+ */
+bool check_mem_org(mem_array & u, const min_values_t *minval)
+{
+  // The checks are evaluated left to right and stop at the first violation.
+  const bool exceeds_limit =
+    (((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev) ||
+    (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+      g_ip->dynamic_power_dev) ||
+    (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+      g_ip->leakage_power_dev) ||
+    (((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+      g_ip->cycle_time_dev) ||
+    (((u.area - minval->min_area)/minval->min_area)*100 >
+      g_ip->area_dev);
+
+  return !exceeds_limit;
+}
+
+
+
+
+/*
+ * Picks the lowest-cost organization from ulist and copies it into *res.
+ *
+ * The cost function depends on g_ip->ed:
+ *   1     - energy-delay product, normalized by the minima in *minval
+ *   2     - energy-delay^2 product, normalized by the minima in *minval
+ *   other - weighted sum of normalized delay, cycle time, dynamic power,
+ *           leakage power and area; only organizations accepted by
+ *           check_uca_org() are considered.
+ *
+ * In the weighted-sum mode ulist is modified: entries that fail the
+ * deviation check, and entries that become the current best (after being
+ * copied into *res), are erased.
+ *
+ * Prints an error and terminates the process if ulist is empty or no
+ * organization could be selected.
+ */
+void find_optimal_uca(uca_org_t *res, min_values_t * minval, list<uca_org_t> & ulist)
+{
+  double cost = 0;
+  double min_cost = BIGNUM;
+  float d, a, dp, lp, c;
+
+  dp = g_ip->dynamic_power_wt;
+  lp = g_ip->leakage_power_wt;
+  a  = g_ip->area_wt;
+  d  = g_ip->delay_wt;
+  c  = g_ip->cycle_time_wt;
+
+  if (ulist.empty() == true)
+  {
+    cout << "ERROR: no valid cache organizations found" << endl;
+    exit(0);
+  }
+
+  // The iterator is advanced explicitly: list::erase() already returns the
+  // element following the erased one, so stepping again after an erase
+  // would skip an organization (or step past end() once the list empties).
+  list<uca_org_t>::iterator niter = ulist.begin();
+  while (niter != ulist.end())
+  {
+    if (g_ip->ed == 1)
+    {
+      cost = ((niter)->access_time/minval->min_delay) * ((niter)->power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost)
+      {
+        min_cost = cost;
+        *res = (*(niter));
+      }
+      ++niter;
+    }
+    else if (g_ip->ed == 2)
+    {
+      cost = ((niter)->access_time/minval->min_delay)*
+             ((niter)->access_time/minval->min_delay)*
+             ((niter)->power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost)
+      {
+        min_cost = cost;
+        *res = (*(niter));
+      }
+      ++niter;
+    }
+    else
+    {
+      /*
+       * check whether the current organization
+       * meets the input deviation constraints
+       */
+      bool v = check_uca_org(*niter, minval);
+      //if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
+
+      if (!v)
+      {
+        // Violates a deviation constraint: discard it.
+        niter = ulist.erase(niter);
+        continue;
+      }
+
+      cost = (d  * ((niter)->access_time/minval->min_delay) +
+              c  * ((niter)->cycle_time/minval->min_cyc) +
+              dp * ((niter)->power.readOp.dynamic/minval->min_dyn) +
+              lp * ((niter)->power.readOp.leakage/minval->min_leakage) +
+              a  * ((niter)->area/minval->min_area));
+
+      if (min_cost > cost)
+      {
+        // New best: keep a copy in *res and drop the list entry.
+        min_cost = cost;
+        *res = (*(niter));
+        niter = ulist.erase(niter);
+      }
+      else
+      {
+        ++niter;
+      }
+    }
+  }
+
+  if (min_cost == BIGNUM)
+  {
+    cout << "ERROR: no cache organizations met optimization criteria" << endl;
+    exit(0);
+  }
+}
+
+
+
+/*
+ * Reduces the list of candidate tag arrays to the single cheapest entry.
+ *
+ * Candidates are consumed from the back of the list. One that fails
+ * check_mem_org() is given a cost of BIGNUM; otherwise its cost is the
+ * weighted sum of its metrics normalized by the minima in *min. Every
+ * candidate except the final winner is deleted, and on return the list
+ * holds only the winner.
+ *
+ * Prints an error and terminates the process if the list is empty or no
+ * candidate has a cost below BIGNUM.
+ */
+void filter_tag_arr(const min_values_t * min, list<mem_array *> & list)
+{
+  double best_cost = BIGNUM;
+  double wt_delay   = g_ip->delay_wt;
+  double wt_dyn     = g_ip->dynamic_power_wt;
+  double wt_leakage = g_ip->leakage_power_wt;
+  double wt_cyc     = g_ip->cycle_time_wt;
+  double wt_area    = g_ip->area_wt;
+  mem_array * best = NULL;
+
+  if (list.empty())
+  {
+    cout << "ERROR: no valid tag organizations found" << endl;
+    exit(1);
+  }
+
+  for (; !list.empty(); list.pop_back())
+  {
+    mem_array * candidate = list.back();
+    double candidate_cost = BIGNUM;
+
+    if (check_mem_org(*candidate, min))
+    {
+      candidate_cost = wt_delay   * (candidate->access_time/min->min_delay) +
+        wt_dyn     * (candidate->power.readOp.dynamic/min->min_dyn) +
+        wt_leakage * (candidate->power.readOp.leakage/min->min_leakage) +
+        wt_area    * (candidate->area/min->min_area) +
+        wt_cyc     * (candidate->cycle_time/min->min_cyc);
+    }
+
+    if (candidate_cost < best_cost)
+    {
+      // The previous winner is superseded (deleting NULL is a no-op).
+      delete best;
+      best_cost = candidate_cost;
+      best      = candidate;
+    }
+    else
+    {
+      delete candidate;
+    }
+  }
+
+  if (best == NULL)
+  {
+    cout << "ERROR: no valid tag organizations found" << endl;
+    exit(0);
+  }
+
+  list.push_back(best);
+}
+
+
+
+/*
+ * Removes (and deletes) every data array whose access time AND read
+ * dynamic power are both more than 50% above the minima recorded in its
+ * arr_min member. All other entries are left in curr_list untouched.
+ *
+ * Prints an error and terminates the process if curr_list is empty;
+ * terminates as well if a NULL entry is encountered.
+ */
+void filter_data_arr(list<mem_array *> & curr_list)
+{
+  if (curr_list.empty() == true)
+  {
+    cout << "ERROR: no valid data array organizations found" << endl;
+    exit(1);
+  }
+
+  // The iterator is advanced explicitly: list::erase() returns the element
+  // after the erased one, so no decrement is needed (decrementing begin()
+  // after erasing the first element would be undefined behaviour).
+  list<mem_array *>::iterator iter = curr_list.begin();
+
+  while (iter != curr_list.end())
+  {
+    mem_array * m = *iter;
+
+    if (m == NULL) exit(1);
+
+    if(((m->access_time - m->arr_min->min_delay)/m->arr_min->min_delay > 0.5) &&
+       ((m->power.readOp.dynamic - m->arr_min->min_dyn)/m->arr_min->min_dyn > 0.5))
+    {
+      delete m;
+      iter = curr_list.erase(iter);
+    }
+    else
+    {
+      ++iter;
+    }
+  }
+}
+
+
+
+/*
+ * Performs exhaustive search across different sub-array sizes,
+ * wire types and aspect ratios to find an optimal UCA organization
+ * 1. First different valid tag array organizations are calculated
+ *    and stored in tag_arr array
+ * 2. The exhaustive search is repeated to find valid data array
+ *    organizations and stored in data_arr array
+ * 3. Cache area, delay, power, and cycle time for different
+ *    cache organizations are calculated based on the
+ *    above results
+ * 4. Cache model with least cost is picked from sol_list
+ *
+ * The search in steps 1 and 2 is spread over nthreads pthreads running
+ * calc_time_mt_wrapper(). The selected organization is copied into
+ * *fin_res by find_optimal_uca(). Step 1 is skipped for pure RAM,
+ * pure CAM and fully-associative configurations.
+ */
+void solve(uca_org_t *fin_res)
+{
+  // NOTE(review): is_dram is assigned below but never read in this function.
+  bool   is_dram  = false;
+  int    pure_ram = g_ip->pure_ram;
+  bool   pure_cam = g_ip->pure_cam;
+
+  init_tech_params(g_ip->F_sz_um, false);
+
+
+  list<mem_array *> tag_arr (0);
+  list<mem_array *> data_arr(0);
+  list<mem_array *>::iterator miter;
+  // sol_list always ends with one blank entry that is filled in for the
+  // next tag/data combination; the unused last one is popped before the
+  // final selection.
+  list<uca_org_t> sol_list(1, uca_org_t());
+
+  fin_res->tag_array.access_time = 0;
+  fin_res->tag_array.Ndwl = 0;
+  fin_res->tag_array.Ndbl = 0;
+  fin_res->tag_array.Nspd = 0;
+  fin_res->tag_array.deg_bl_muxing = 0;
+  fin_res->tag_array.Ndsam_lev_1 = 0;
+  fin_res->tag_array.Ndsam_lev_2 = 0;
+
+
+  // distribute calculate_time() execution to multiple threads
+  calc_time_mt_wrapper_struct * calc_array = new calc_time_mt_wrapper_struct[nthreads];
+  pthread_t threads[nthreads];
+
+  // Per-thread work descriptors; each thread gets its own min-value
+  // accumulators, which are merged after the threads have been joined.
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    calc_array[t].tid         = t;
+    calc_array[t].pure_ram    = pure_ram;
+    calc_array[t].pure_cam    = pure_cam;
+    calc_array[t].data_res    = new min_values_t();
+    calc_array[t].tag_res     = new min_values_t();
+  }
+
+  bool     is_tag;
+  uint32_t ram_cell_tech_type;
+
+  // If it's a cache, first calculate the area, delay and power for all tag array partitions.
+  if (!(pure_ram||pure_cam||g_ip->fully_assoc))
+  { //cache
+    is_tag              = true;
+    ram_cell_tech_type  = g_ip->tag_arr_ram_cell_tech_type;
+    is_dram             = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
+    init_tech_params(g_ip->F_sz_um, is_tag);
+
+    // NOTE(review): the return values of pthread_create/pthread_join are
+    // not checked here or in the data pass below.
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].is_tag      = is_tag;
+      calc_array[t].is_main_mem = false;
+      calc_array[t].Nspd_min    = 0.125;
+      pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
+    }
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      pthread_join(threads[t], NULL);
+    }
+
+    // Merge the per-thread result lists into the global lists, ordered by
+    // mem_array::lt.
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].data_arr.sort(mem_array::lt);
+      data_arr.merge(calc_array[t].data_arr, mem_array::lt);
+      calc_array[t].tag_arr.sort(mem_array::lt);
+      tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
+    }
+  }
+
+
+  // calculate the area, delay and power for all data array partitions (for cache or plain RAM).
+//  if (!g_ip->fully_assoc)
+// {//in the new cacti, cam, fully_associative cache are processed as single array in the data portion
+    is_tag              = false;
+    ram_cell_tech_type  = g_ip->data_arr_ram_cell_tech_type;
+    is_dram             = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
+    init_tech_params(g_ip->F_sz_um, is_tag);
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].is_tag      = is_tag;
+      calc_array[t].is_main_mem = g_ip->is_main_mem;
+      // Lower bound of the Nspd sweep: output width over block size in
+      // bits, or 1 for pure CAM / fully-associative configurations.
+      if (!(pure_cam||g_ip->fully_assoc))
+      {
+          calc_array[t].Nspd_min    = (double)(g_ip->out_w)/(double)(g_ip->block_sz*8);
+      }
+      else
+      {
+          calc_array[t].Nspd_min    = 1;
+      }
+
+      pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
+    }
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      pthread_join(threads[t], NULL);
+    }
+
+    // Start the global data list afresh, then merge this pass's results.
+    data_arr.clear();
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].data_arr.sort(mem_array::lt);
+      data_arr.merge(calc_array[t].data_arr, mem_array::lt);
+    }
+//  }
+
+
+  // d_min / t_min: minima over all data / tag candidates;
+  // cache_min: minima over the combined organizations built below.
+  min_values_t * d_min = new min_values_t();
+  min_values_t * t_min = new min_values_t();
+  min_values_t * cache_min = new min_values_t();
+
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    d_min->update_min_values(calc_array[t].data_res);
+    t_min->update_min_values(calc_array[t].tag_res);
+  }
+
+  // filter_data_arr() reads the minima through each entry's arr_min.
+  for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+  {
+    (*miter)->arr_min = d_min;
+  }
+
+
+  //cout << data_arr.size() << "\t" << tag_arr.size() <<" before\n";
+  filter_data_arr(data_arr);
+  if(!(pure_ram||pure_cam||g_ip->fully_assoc))
+  {
+    filter_tag_arr(t_min, tag_arr);
+  }
+  //cout << data_arr.size() << "\t" << tag_arr.size() <<" after\n";
+
+
+  if (pure_ram||pure_cam||g_ip->fully_assoc)
+  {
+    // No separate tag array: one candidate organization per data array.
+    for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+    {
+      uca_org_t & curr_org  = sol_list.back();
+      curr_org.tag_array2  = NULL;
+      curr_org.data_array2 = (*miter);
+
+      curr_org.find_delay();
+      curr_org.find_energy();
+      curr_org.find_area();
+      curr_org.find_cyc();
+
+      //update min values for the entire cache
+      cache_min->update_min_values(curr_org);
+
+      sol_list.push_back(uca_org_t());
+    }
+  }
+  else
+  {
+    // Pair every remaining tag array with every remaining data array.
+    while (tag_arr.empty() != true)
+    {
+      mem_array * arr_temp = (tag_arr.back());
+      //delete tag_arr.back();
+      tag_arr.pop_back();
+
+      for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+      {
+        uca_org_t & curr_org  = sol_list.back();
+        curr_org.tag_array2  = arr_temp;
+        curr_org.data_array2 = (*miter);
+
+        curr_org.find_delay();
+        curr_org.find_energy();
+        curr_org.find_area();
+        curr_org.find_cyc();
+
+        //update min values for the entire cache
+        cache_min->update_min_values(curr_org);
+
+        sol_list.push_back(uca_org_t());
+      }
+    }
+  }
+
+  // Remove the unused blank entry at the end.
+  sol_list.pop_back();
+
+  find_optimal_uca(fin_res, cache_min, sol_list);
+
+  sol_list.clear();
+
+  // Free every data array except the one the final result points to.
+  for (miter = data_arr.begin(); miter != data_arr.end(); ++miter)
+  {
+    if (*miter != fin_res->data_array2)
+    {
+      delete *miter;
+    }
+  }
+  data_arr.clear();
+
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    delete calc_array[t].data_res;
+    delete calc_array[t].tag_res;
+  }
+
+  delete [] calc_array;
+  delete cache_min;
+  delete d_min;
+  delete t_min;
+}
+
+/*
+ * Recomputes the power of an already selected organization.
+ *
+ * The tag array (when fin_res->tag_array2 is set) and the data array are
+ * each rebuilt as a UCA from the partition parameters stored in the
+ * corresponding mem_array, after re-initializing the technology
+ * parameters. The resulting power is copied into the mem_array and
+ * fin_res->find_energy() is called at the end.
+ *
+ * Prints an error and terminates the process if the stored parameters no
+ * longer yield a valid DynamicParameter.
+ */
+void update(uca_org_t *fin_res)
+{
+  if(fin_res->tag_array2)
+  {
+    init_tech_params(g_ip->F_sz_um,true);
+    DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam, fin_res->tag_array2->Nspd, fin_res->tag_array2->Ndwl, fin_res->tag_array2->Ndbl, fin_res->tag_array2->Ndcm, fin_res->tag_array2->Ndsam_lev_1, fin_res->tag_array2->Ndsam_lev_2, g_ip->is_main_mem);
+    if(tag_arr_dyn_p.is_valid)
+    {
+      UCA * tag_arr = new UCA(tag_arr_dyn_p);
+      fin_res->tag_array2->power = tag_arr->power;
+      // Only the copied power is needed; release the temporary model, as
+      // calculate_time() does after copying from its UCA.
+      delete tag_arr;
+    }
+    else
+    {
+      cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
+      exit(1);
+    }
+  }
+  init_tech_params(g_ip->F_sz_um,false);
+  DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam, fin_res->data_array2->Nspd, fin_res->data_array2->Ndwl, fin_res->data_array2->Ndbl, fin_res->data_array2->Ndcm, fin_res->data_array2->Ndsam_lev_1, fin_res->data_array2->Ndsam_lev_2, g_ip->is_main_mem);
+  if(data_arr_dyn_p.is_valid)
+  {
+    UCA * data_arr = new UCA(data_arr_dyn_p);
+    fin_res->data_array2->power = data_arr->power;
+    // Release the temporary model once its power has been copied out.
+    delete data_arr;
+  }
+  else
+  {
+    cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
+    exit(1);
+  }
+
+  fin_res->find_energy();
+}
+
diff --git a/ext/mcpat/cacti/Ucache.h b/ext/mcpat/cacti/Ucache.h
new file mode 100644 (file)
index 0000000..20985ff
--- /dev/null
@@ -0,0 +1,115 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#ifndef __UCACHE_H__
+#define __UCACHE_H__
+
+#include <list>
+
+#include "area.h"
+#include "nuca.h"
+#include "router.h"
+
+class min_values_t
+{
+  public:
+    double min_delay;
+    double min_dyn;
+    double min_leakage;
+    double min_area;
+    double min_cyc;
+
+    min_values_t() : min_delay(BIGNUM), min_dyn(BIGNUM), min_leakage(BIGNUM), min_area(BIGNUM), min_cyc(BIGNUM) { }
+
+    void update_min_values(const min_values_t * val);
+    void update_min_values(const uca_org_t & res);
+    void update_min_values(const nuca_org_t * res);
+    void update_min_values(const mem_array * res);
+};
+
+
+
+/*
+ * Plain aggregate describing one tag/data array pairing together with its
+ * figures of merit.  No constructor is provided, so the fields hold
+ * indeterminate values until they are assigned.
+ * NOTE(review): field meanings are inferred from their names only; confirm
+ * against the code that fills this struct.
+ */
+struct solution
+{
+  // Position of the chosen arrays, both as an index and as a list iterator.
+  int tag_array_index;
+  int data_array_index;
+  std::list<mem_array *>::iterator tag_array_iter;
+  std::list<mem_array *>::iterator data_array_iter;
+
+  // Figures of merit for the pairing.
+  double access_time;
+  double cycle_time;
+  double area;
+  double efficiency;
+  powerDef total_power;
+};
+
+
+
+// Evaluate one array partitioning given by Nspd, Ndwl, Ndbl, Ndcm,
+// Ndsam_lev_1 and Ndsam_lev_2.
+// NOTE(review): the definition is not part of this header; presumably the
+// return value says whether the partitioning was valid and the results are
+// written through ptr_array / ptr_results / ptr_fin_res -- confirm there.
+// NOTE(review): pure_ram is an int here but a bool in
+// calc_time_mt_wrapper_struct.
+bool calculate_time(
+    bool is_tag,
+    int pure_ram,
+    bool pure_cam,
+    double Nspd,
+    unsigned int Ndwl,
+    unsigned int Ndbl,
+    unsigned int Ndcm,
+    unsigned int Ndsam_lev_1,
+    unsigned int Ndsam_lev_2,
+    mem_array *ptr_array,
+    int flag_results_populate,
+    results_mem_array *ptr_results,
+    uca_org_t *ptr_fin_res,
+    bool is_main_mem);
+// Rebuild the UCA models for the tag (if present) and data arrays of
+// fin_res, copy their power back and call fin_res->find_energy().
+void update(uca_org_t *fin_res);
+
+// Fills fin_res with the selected organization.
+// NOTE(review): presumably the top-level design-space search; confirm
+// against the definition.
+void solve(uca_org_t *fin_res);
+// update() calls this with g_ip->F_sz_um as tech, passing is_tag = true
+// before modelling the tag array and false before the data array.
+void init_tech_params(double tech, bool is_tag);
+
+
+/*
+ * Per-thread argument/result bundle; tid identifies the thread.
+ * NOTE(review): presumably one instance is handed to each
+ * calc_time_mt_wrapper() invocation through its void* argument -- confirm
+ * against the caller.
+ */
+struct calc_time_mt_wrapper_struct
+{
+  uint32_t tid;
+
+  // Array-kind switches and the lower bound for Nspd.
+  bool is_tag;
+  bool pure_ram;
+  bool pure_cam;
+  bool is_main_mem;
+  double Nspd_min;
+
+  // Minimum-value records for the data and tag arrays.
+  min_values_t *data_res;
+  min_values_t *tag_res;
+
+  // Candidate arrays held by pointer.
+  std::list<mem_array *> data_arr;
+  std::list<mem_array *> tag_arr;
+};
+
+// Thread entry point taking and returning void*.
+// NOTE(review): the signature matches a pthread start routine and void_obj
+// is presumably a calc_time_mt_wrapper_struct*; confirm against the
+// definition.
+void *calc_time_mt_wrapper(void * void_obj);
+
+#endif
diff --git a/ext/mcpat/cacti/arbiter.cc b/ext/mcpat/cacti/arbiter.cc
new file mode 100644 (file)
index 0000000..6664abf
--- /dev/null
@@ -0,0 +1,130 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#include "arbiter.h"
+
+/*
+ * Set up an arbiter model.
+ *
+ *   n_req       stored as R; used as a multiplier in compute_power()
+ *   flit_size_  stored as flit_size; only printed by print_arbiter()
+ *   output_len  stored as o_len; scaled by 1e-6 in crossbar_ctrline()
+ *   dt          device type supplying Vdd and the n-to-p drive ratio
+ *
+ * All transistor size factors are fixed multiples of half the feature size
+ * g_ip->F_sz_um.  TriS1 and TriS2 are not assigned here.
+ */
+Arbiter::Arbiter(
+    double n_req,
+    double flit_size_,
+    double output_len,
+    TechnologyParameter::DeviceType *dt)
+  : R(n_req),
+    flit_size(flit_size_),
+    o_len(output_len),
+    deviceType(dt)
+{
+  const double technology = g_ip->F_sz_um;
+
+  // deviceType was just initialised from dt, so both name the same object.
+  Vdd        = dt->Vdd;
+  min_w_pmos = dt->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+
+  /* NOR gate size factors (n/p), first and second gate */
+  NTn1 = 13.5*technology/2;
+  PTn1 = 76*technology/2;
+  NTn2 = 13.5*technology/2;
+  PTn2 = 76*technology/2;
+
+  /* inverter size factors */
+  NTi = 12.5*technology/2;
+  PTi = 25*technology/2;
+
+  /* transmission gate: nmos, then pmos transistor length */
+  NTtr = 10*technology/2;
+  PTtr = 20*technology/2;
+}
+
+// Nothing to release: the constructor does not allocate anything.
+Arbiter::~Arbiter()
+{
+}
+
+// Capacitance term for one request line: (R-1) first-NOR gate loads, one
+// second-NOR gate load, plus the inverter's gate and drain capacitance.
+double
+Arbiter::arb_req()
+{
+  const double nor1_gate_load = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);
+
+  return (R-1)*nor1_gate_load + 2*gate_C(NTn2, 0) +
+      gate_C(PTn2, 0) + gate_C(NTi, 0) + gate_C(PTi, 0) +
+      drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) +
+      drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
+}
+
+// Capacitance term for the priority logic: twice the first-NOR gate load.
+// The switching capacitance of the flip-flop is ignored.
+double
+Arbiter::arb_pri()
+{
+  const double nor1_gate_load = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);
+  return 2*nor1_gate_load;
+}
+
+
+// Capacitance term for the grant output: first-NOR drain capacitance plus
+// the crossbar control line it drives.
+double
+Arbiter::arb_grant()
+{
+  const double nmos_drain = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def);
+  const double pmos_drain = drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
+
+  return nmos_drain*2 + pmos_drain + crossbar_ctrline();
+}
+
+// Capacitance term for the internal node: first-NOR drain capacitance plus
+// second-NOR gate capacitance.
+double
+Arbiter::arb_int()
+{
+  const double nmos_drain = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def);
+  const double pmos_drain = drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
+
+  return nmos_drain*2 + pmos_drain +
+      2*gate_C(NTn2, 0) + gate_C(PTn2, 0);
+}
+
+void
+Arbiter::compute_power() {
+  power.readOp.dynamic =  (R*arb_req()*Vdd*Vdd/2 + R*arb_pri()*Vdd*Vdd/2 +
+      arb_grant()*Vdd*Vdd + arb_int()*0.5*Vdd*Vdd);
+  double nor1_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
+  double nor2_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
+  double not_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
+  double nor1_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
+  double nor2_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
+  double not_leak_gate  = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
+  power.readOp.leakage = (nor1_leak + nor2_leak + not_leak)*Vdd; //FIXME include priority table leakage
+  power.readOp.gate_leakage = nor1_leak_gate*Vdd + nor2_leak_gate*Vdd + not_leak_gate*Vdd;
+}
+
+// Wire capacitance with triple spacing for a wire of the given length.
+double
+Arbiter::Cw3(double length)
+{
+  Wire triple_spaced(g_ip->wt, length, 1, 3, 3);
+  return triple_spaced.wire_cap(length,true);
+}
+
+// Capacitance of the crossbar control line: the wire itself plus the drain
+// and gate capacitance of one inverter.
+double
+Arbiter::crossbar_ctrline()
+{
+  const double wire_len = o_len * 1e-6; /* m */
+
+  double cap = Cw3(wire_len);
+  cap = cap + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def);
+  cap = cap + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
+  cap = cap + gate_C(NTi, 0);
+  cap = cap + gate_C(PTi, 0);
+  return cap;
+}
+
+// Control capacitance of the transmission gate: nmos plus pmos gate cap.
+double
+Arbiter::transmission_buf_ctrcap()
+{
+  return gate_C(NTtr, 0)+gate_C(PTtr, 0);
+}
+
+
+// Print the request count, flit size and the readOp dynamic and leakage
+// figures to standard output.
+void Arbiter::print_arbiter()
+{
+  const double dynamic_scaled = power.readOp.dynamic*1e9;
+  const double leakage_scaled = power.readOp.leakage*1e3;
+
+  cout << "\nArbiter Stats (";
+  cout << R << " input arbiter";
+  cout << ")\n\n";
+
+  cout << "Flit size        : " << flit_size << " bits" << endl;
+  cout << "Dynamic Power    : " << dynamic_scaled << " (nJ)" << endl;
+  cout << "Leakage Power    : " << leakage_scaled << " (mW)" << endl;
+}
+
+
diff --git a/ext/mcpat/cacti/arbiter.h b/ext/mcpat/cacti/arbiter.h
new file mode 100644 (file)
index 0000000..32ada92
--- /dev/null
@@ -0,0 +1,79 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef __ARBITER__
+#define __ARBITER__
+
+#include <assert.h>
+
+#include <iostream>
+
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "mat.h"
+#include "parameter.h"
+#include "wire.h"
+
+class Arbiter : public Component
+{
+  public:
+    Arbiter(
+      double Req,
+      double flit_sz,
+      double output_len,
+      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Arbiter();
+
+    void print_arbiter();
+    double arb_req();
+    double arb_pri();
+    double arb_grant();
+    double arb_int();
+    void compute_power();
+    double Cw3(double len);
+    double crossbar_ctrline();
+    double transmission_buf_ctrcap();
+
+
+
+  private:
+    double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi;
+    double flit_size;
+    double NTtr, PTtr;
+    double o_len;
+    TechnologyParameter::DeviceType *deviceType;
+    double TriS1, TriS2;
+    double min_w_pmos, Vdd;
+
+};
+
+#endif
diff --git a/ext/mcpat/cacti/area.cc b/ext/mcpat/cacti/area.cc
new file mode 100644 (file)
index 0000000..14ea4a9
--- /dev/null
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "area.h"
+#include "basic_circuit.h"
+#include "component.h"
+#include "decoder.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+
diff --git a/ext/mcpat/cacti/area.h b/ext/mcpat/cacti/area.h
new file mode 100644 (file)
index 0000000..7705e62
--- /dev/null
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __AREA_H__
+#define __AREA_H__
+
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+
+using namespace std;
+
+class Area
+{
+ public:
+  double w;
+  double h;
+
+  Area():w(0), h(0), area(0) { }
+  double get_w() const { return w; }
+  double get_h() const { return h; }
+  double get_area() const
+  {
+    if (w == 0 && h == 0)
+    {
+      return area;
+    }
+    else
+    {
+      return w*h;
+    }
+  }
+  void set_w(double w_) { w = w_; }
+  void set_h(double h_) { h = h_; }
+  void set_area(double a_) { area = a_; }
+
+ private:
+  double area;
+};
+
+#endif
+
diff --git a/ext/mcpat/cacti/bank.cc b/ext/mcpat/cacti/bank.cc
new file mode 100755 (executable)
index 0000000..a18c7f1
--- /dev/null
@@ -0,0 +1,198 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include <iostream>
+
+#include "bank.h"
+
+Bank::Bank(const DynamicParameter & dyn_p):
+  dp(dyn_p), mat(dp),
+  num_addr_b_mat(dyn_p.number_addr_bits_mat),
+  num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir)
+{
+  int RWP;
+  int ERP;
+  int EWP;
+  int SCHP;
+
+  if (dp.use_inp_params)
+  {
+    RWP  = dp.num_rw_ports;
+    ERP  = dp.num_rd_ports;
+    EWP  = dp.num_wr_ports;
+    SCHP = dp.num_search_ports;
+  }
+  else
+  {
+    RWP  = g_ip->num_rw_ports;
+    ERP  = g_ip->num_rd_ports;
+    EWP  = g_ip->num_wr_ports;
+    SCHP = g_ip->num_search_ports;
+  }
+
+  int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
+  int datainbits     = dp.num_di_b_bank_per_port * (RWP + EWP);
+  int dataoutbits    = dp.num_do_b_bank_per_port * (RWP + ERP);
+  int searchinbits;
+  int searchoutbits;
+
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+          datainbits   = dp.num_di_b_bank_per_port * (RWP + EWP);
+          dataoutbits  = dp.num_do_b_bank_per_port * (RWP + ERP);
+          searchinbits    = dp.num_si_b_bank_per_port * SCHP;
+          searchoutbits   = dp.num_so_b_bank_per_port * SCHP;
+  }
+
+  if (!(dp.fully_assoc || dp.pure_cam))
+    {
+    if (g_ip->fast_access && dp.is_tag == false)
+    {
+        dataoutbits *= g_ip->data_assoc;
+    }
+
+  htree_in_add   = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+      total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
+  htree_in_data  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+      total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
+  htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+      total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
+
+//  htree_out_data = new Htree2 (g_ip->wt,(double) 100, (double)100,
+//               total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
+
+  area.w = htree_in_data->area.w;
+  area.h = htree_in_data->area.h;
+  }
+  else
+  {
+          htree_in_add   = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+                          total_addrbits, datainbits, searchinbits,dataoutbits,searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
+          htree_in_data  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+                          total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
+          htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+                          total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
+          htree_in_search  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+                          total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree,true, true);
+          htree_out_search = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
+                          total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree,true);
+
+      area.w = htree_in_data->area.w;
+      area.h = htree_in_data->area.h;
+  }
+
+  num_addr_b_row_dec = _log2(mat.subarray.num_rows);
+  num_addr_b_routed_to_mat_for_act = num_addr_b_row_dec;
+  num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec;
+}
+
+
+
+// Release the H-trees created by the constructor.  The two search trees
+// are only deleted for fully associative / CAM banks, the same condition
+// under which the constructor creates them.
+Bank::~Bank()
+{
+  delete htree_in_add;
+  delete htree_out_data;
+  delete htree_in_data;
+
+  if (!(dp.fully_assoc || dp.pure_cam))
+  {
+    return;
+  }
+
+  delete htree_in_search;
+  delete htree_out_search;
+}
+
+
+
+// Forward the delay computation to the mat and hand back its result.
+double Bank::compute_delays(double inrisetime)
+{
+  const double outrisetime = mat.compute_delays(inrisetime);
+  return outrisetime;
+}
+
+
+
+void Bank::compute_power_energy()
+{
+  mat.compute_power_energy();
+
+  if (!(dp.fully_assoc || dp.pure_cam))
+  {
+          power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir;
+          power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats;
+          power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;
+
+          power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
+          power.readOp.dynamic += htree_out_data->power.readOp.dynamic;
+
+          power.readOp.leakage += htree_in_add->power.readOp.leakage;
+          power.readOp.leakage += htree_in_data->power.readOp.leakage;
+          power.readOp.leakage += htree_out_data->power.readOp.leakage;
+          power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
+  }
+  else
+  {
+
+          power.readOp.dynamic += mat.power.readOp.dynamic ;//for fa and cam num_act_mats_hor_dir is 1 for plain r/w
+          power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats;
+          power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;
+
+          power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats;
+          power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic +
+                                        mat.power_sa.searchOp.dynamic +
+                                        mat.power_bitline.searchOp.dynamic +
+                                        mat.power_subarray_out_drv.searchOp.dynamic+
+                                        mat.ml_to_ram_wl_drv->power.readOp.dynamic;
+
+          power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
+          power.readOp.dynamic += htree_out_data->power.readOp.dynamic;
+
+          power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic;
+          power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic;
+
+          power.readOp.leakage += htree_in_add->power.readOp.leakage;
+          power.readOp.leakage += htree_in_data->power.readOp.leakage;
+          power.readOp.leakage += htree_out_data->power.readOp.leakage;
+          power.readOp.leakage += htree_in_search->power.readOp.leakage;
+          power.readOp.leakage += htree_out_search->power.readOp.leakage;
+
+
+          power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage;
+          power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage;
+
+  }
+
+}
+
diff --git a/ext/mcpat/cacti/bank.h b/ext/mcpat/cacti/bank.h
new file mode 100755 (executable)
index 0000000..153609a
--- /dev/null
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __BANK_H__
+#define __BANK_H__
+
+#include "component.h"
+#include "decoder.h"
+#include "htree2.h"
+#include "mat.h"
+
+class Bank : public Component
+{
+  public:
+    Bank(const DynamicParameter & dyn_p);
+    ~Bank();
+    double compute_delays(double inrisetime);  // return outrisetime
+    void   compute_power_energy();
+
+    const DynamicParameter & dp;
+    Mat   mat;
+    Htree2 *htree_in_add;
+    Htree2 *htree_in_data;
+    Htree2 *htree_out_data;
+    Htree2 *htree_in_search;
+    Htree2 *htree_out_search;
+
+    int  num_addr_b_mat;
+    int  num_mats_hor_dir;
+    int  num_mats_ver_dir;
+
+    int  num_addr_b_row_dec;
+    int  num_addr_b_routed_to_mat_for_act;
+    int  num_addr_b_routed_to_mat_for_rd_or_wr;
+};
+
+
+
+#endif
diff --git a/ext/mcpat/cacti/basic_circuit.cc b/ext/mcpat/cacti/basic_circuit.cc
new file mode 100644 (file)
index 0000000..6efd5dd
--- /dev/null
@@ -0,0 +1,829 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "basic_circuit.h"
+#include "parameter.h"
+
+// Integer floor(log2(num)).  For num == 0 the function prints "log0?" to
+// stderr and terminates the process with exit status 1.
+uint32_t _log2(uint64_t num)
+{
+  if (num == 0)
+  {
+    std::cerr << "log0?" << std::endl;
+    exit(1);
+  }
+
+  // Count the right shifts needed to bring the value down to 1.
+  uint32_t shifts = 0;
+  for (uint64_t rest = num; rest > 1; rest >>= 1)
+  {
+    ++shifts;
+  }
+  return shifts;
+}
+
+
+// Returns true when val is a positive power of two (1, 2, 4, ...).
+// Zero and negative values are never powers of two.
+bool is_pow2(int64_t val)
+{
+  // A positive power of two has exactly one bit set, so clearing its lowest
+  // set bit with (val & (val - 1)) must leave zero.  The val > 0 test runs
+  // first, so val - 1 is never evaluated for non-positive input.
+  return (val > 0) && ((val & (val - 1)) == 0);
+}
+
+
+// Integer power: base multiplied by itself n times.  Returns 1 when n <= 0.
+int powers (int base, int n)
+{
+  int result = 1;
+  int remaining = n;
+
+  while (remaining > 0)
+  {
+    result *= base;
+    --remaining;
+  }
+  return result;
+}
+
+/*----------------------------------------------------------------------*/
+
+// Base-2 logarithm of x, computed as ln(x) / ln(2).  x must be strictly
+// positive (checked with assert).
+double logtwo (double x)
+{
+  assert(x > 0);
+  const double ln_x = log (x);
+  const double ln_2 = log (2.0);
+  return ln_x / ln_2;
+}
+
+/*----------------------------------------------------------------------*/
+
+
+double gate_C(
+    double width,
+    double wirelength,
+    bool   _is_dram,
+    bool   _is_cell,
+    bool   _is_wl_tr)
+{
+  const TechnologyParameter::DeviceType * dt;
+
+  if (_is_dram && _is_cell)
+  {
+    dt = &g_tp.dram_acc;   //DRAM cell access transistor
+  }
+  else if (_is_dram && _is_wl_tr)
+  {
+    dt = &g_tp.dram_wl;    //DRAM wordline transistor
+  }
+  else if (!_is_dram && _is_cell)
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
+}
+
+
+// returns gate capacitance in Farads
+// actually this function is the same as gate_C() now
+double gate_C_pass(
+    double width,       // gate width in um (length is Lphy_periph_global)
+    double wirelength,  // poly wire length going to gate in lambda
+    bool   _is_dram,
+    bool   _is_cell,
+    bool   _is_wl_tr)
+{
+  // v5.0
+  const TechnologyParameter::DeviceType * dt;
+
+  if ((_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.dram_acc;   //DRAM cell access transistor
+  }
+  else if ((_is_dram) && (_is_wl_tr))
+  {
+    dt = &g_tp.dram_wl;    //DRAM wordline transistor
+  }
+  else if ((!_is_dram) && _is_cell)
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
+}
+
+
+
+// Drain capacitance of a transistor of the given width, as the sum of four
+// parts computed below: junction area, junction sidewall, drain-to-gate
+// (fringe + overlap) and the metal connecting folded fingers.
+//   nchannel  - non-zero selects the n-channel share of the cell height
+//               (only used when folding by cell height)
+//   stack     - number of stacked transistors; adds poly-to-poly spacing
+//   next_arg_thresh_folding_width_or_height_cell
+//             - 0: fold_dimension is the folding-width threshold
+//               otherwise: fold_dimension is the height of the enclosing cell
+//   _is_dram/_is_cell/_is_wl_tr - select the device entry in g_tp
+// NOTE(review): width is divided by w_folded_tr; a fold_dimension that makes
+// w_folded_tr zero or negative is not guarded against here.
+double drain_C_(
+    double width,
+    int nchannel,
+    int stack,
+    int next_arg_thresh_folding_width_or_height_cell,
+    double fold_dimension,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  double w_folded_tr;
+  const  TechnologyParameter::DeviceType * dt;
+
+  if ((_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.dram_acc;   // DRAM cell access transistor
+  }
+  else if ((_is_dram) && (_is_wl_tr))
+  {
+    dt = &g_tp.dram_wl;    // DRAM wordline transistor
+  }
+  else if ((!_is_dram) && _is_cell)
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  // Fringe and overlap terms are doubled relative to the device entry.
+  double c_junc_area = dt->C_junc;
+  double c_junc_sidewall = dt->C_junc_sidewall;
+  double c_fringe    = 2*dt->C_fringe;
+  double c_overlap   = 2*dt->C_overlap;
+  double drain_C_metal_connecting_folded_tr = 0;
+
+  // determine the width of the transistor after folding (if it is getting folded)
+  if (next_arg_thresh_folding_width_or_height_cell == 0)
+  { // interpret fold_dimension as the folding width threshold
+    // i.e. the value of transistor width above which the transistor gets folded
+    w_folded_tr = fold_dimension;
+  }
+  else
+  { // interpret fold_dimension as the height of the cell that this transistor is part of.
+    double h_tr_region  = fold_dimension - 2 * g_tp.HPOWERRAIL;
+    // TODO : w_folded_tr must come from Component::compute_gate_area()
+    // Fixed 2:1 split of the usable height: p-channel gets 2/3, n-channel 1/3.
+    double ratio_p_to_n = 2.0 / (2.0 + 1.0);
+    if (nchannel)
+    {
+      w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
+    }
+    else
+    {
+      w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
+    }
+  }
+  int num_folded_tr = (int) (ceil(width / w_folded_tr));
+
+  // Fewer than two fingers means no folding: the finger is the whole device.
+  if (num_folded_tr < 2)
+  {
+    w_folded_tr = width;
+  }
+
+  double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +  // only for drain
+                         (stack - 1) * g_tp.spacing_poly_to_poly;
+  double drain_h_for_sidewall = w_folded_tr;
+  double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1);
+  if (num_folded_tr > 1)
+  {
+    total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +
+                     (num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly);
+
+    // With an even number of fingers the sidewall height term is dropped.
+    if (num_folded_tr%2 == 0)
+    {
+      drain_h_for_sidewall = 0;
+    }
+    total_drain_height_for_cap_wrt_gate *= num_folded_tr;
+    drain_C_metal_connecting_folded_tr   = g_tp.wire_local.C_per_um * total_drain_w;
+  }
+
+  double drain_C_area     = c_junc_area * total_drain_w * w_folded_tr;
+  double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w);
+  double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate;
+
+  return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr);
+}
+
+
+double tr_R_on(
+    double width,
+    int nchannel,
+    int stack,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  const TechnologyParameter::DeviceType * dt;
+
+  if ((_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.dram_acc;   //DRAM cell access transistor
+  }
+  else if ((_is_dram) && (_is_wl_tr))
+  {
+    dt = &g_tp.dram_wl;    //DRAM wordline transistor
+  }
+  else if ((!_is_dram) && _is_cell)
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
+  return (stack * restrans / width);
+}
+
+
+/* This routine operates in reverse: given a resistance, it finds
+ * the transistor width that would have this R.  It is used in the
+ * data wordline to estimate the wordline driver size. */
+
+// returns width in um
+double R_to_w(
+    double res,
+    int   nchannel,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  const TechnologyParameter::DeviceType * dt;
+
+  if ((_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.dram_acc;   //DRAM cell access transistor
+  }
+  else if ((_is_dram) && (_is_wl_tr))
+  {
+    dt = &g_tp.dram_wl;    //DRAM wordline transistor
+  }
+  else if ((!_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
+  return (restrans / res);
+}
+
+
+// PMOS-to-NMOS sizing ratio, read from the n_to_p_eff_curr_drv_ratio field
+// of the DRAM wordline device when both flags are set, and of the
+// peripheral/global device otherwise.
+double pmos_to_nmos_sz_ratio(
+    bool _is_dram,
+    bool _is_wl_tr)
+{
+  if (_is_dram && _is_wl_tr)
+  {
+    return g_tp.dram_wl.n_to_p_eff_curr_drv_ratio;   // DRAM wordline transistor
+  }
+  return g_tp.peri_global.n_to_p_eff_curr_drv_ratio; // all other transistors
+}
+
+
+// "Timing Models for MOS Circuits" by Mark Horowitz, 1984
+double horowitz(
+    double inputramptime, // input rise time
+    double tf,            // time constant of gate
+    double vs1,           // threshold voltage
+    double vs2,           // threshold voltage
+    int    rise)          // whether input rises or fall
+{
+  if (inputramptime == 0 && vs1 == vs2)
+  {
+    return tf * (vs1 < 1 ? -log(vs1) : log(vs1));
+  }
+  double a, b, td;
+
+  a = inputramptime / tf;
+  if (rise == RISE)
+  {
+    b = 0.5;
+    td = tf * sqrt(log(vs1)*log(vs1) + 2*a*b*(1.0 - vs1)) + tf*(log(vs1) - log(vs2));
+  }
+  else
+  {
+    b = 0.4;
+    td = tf * sqrt(log(1.0 - vs1)*log(1.0 - vs1) + 2*a*b*(vs1)) + tf*(log(1.0 - vs1) - log(1.0 - vs2));
+  }
+  return (td);
+}
+
+// Off-state leakage of an NMOS/PMOS pair:
+//   nWidth * I_off_n + pWidth * I_off_p
+// using the SRAM cell device, the DRAM wordline device, or (for everything
+// else, DRAM cells included) the peripheral/global device.
+double cmos_Ileak(
+    double nWidth,
+    double pWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  const bool sram_cell_tr = !_is_dram && _is_cell;
+  const bool dram_wl_tr   = _is_dram && _is_wl_tr;
+
+  const TechnologyParameter::DeviceType * device =
+      sram_cell_tr ? &g_tp.sram_cell :
+      dram_wl_tr   ? &g_tp.dram_wl   :
+                     &g_tp.peri_global;
+
+  return nWidth*device->I_off_n + pWidth*device->I_off_p;
+}
+
+
+// NMOS off-state leakage: nwidth times the I_off_n of the selected device.
+double simplified_nmos_leakage(
+    double nwidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  if (!_is_dram && _is_cell)
+  { // SRAM cell access transistor
+    return nwidth * g_tp.sram_cell.I_off_n;
+  }
+  if (_is_dram && _is_wl_tr)
+  { // DRAM wordline transistor
+    return nwidth * g_tp.dram_wl.I_off_n;
+  }
+  // DRAM or SRAM all other transistors
+  return nwidth * g_tp.peri_global.I_off_n;
+}
+
+// Product of the integers m, m+1, ..., n.  With m == 1 (the default declared
+// in basic_circuit.h) this is n!.  When the range is empty (m > n) the
+// result is 1, the empty product; returning m in that case would make
+// factorial(n, n + 1) equal n + 1 and break combination(n, n).
+int factorial(int n, int m)
+{
+        if (m > n)
+                return 1;
+
+        int fa = m, i;
+        for (i=m+1; i<=n; i++)
+                fa *=i;
+        return fa;
+}
+
+// Binomial coefficient C(n, m): the number of ways to choose m items out of
+// n.  Returns 0 when m is outside [0, n].
+//
+// Computed with the multiplicative formula instead of a ratio of factorial
+// products.  This gives C(n, n) == 1 and keeps intermediates small, so
+// moderate n no longer overflows int.
+int combination(int n, int m)
+{
+  if (m < 0 || m > n)
+  {
+    return 0;
+  }
+
+  // C(n, m) == C(n, n - m); use the smaller index to shorten the loop.
+  if (m > n - m)
+  {
+    m = n - m;
+  }
+
+  // After step i the value is C(n - m + i, i), so every division is exact.
+  long long ret = 1;
+  for (int i = 1; i <= m; ++i)
+  {
+    ret = ret * (n - m + i) / i;
+  }
+  return (int) ret;
+}
+
+// PMOS off-state leakage: pwidth times the I_off_p of the selected device.
+double simplified_pmos_leakage(
+    double pwidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  if (!_is_dram && _is_cell)
+  { // SRAM cell access transistor
+    return pwidth * g_tp.sram_cell.I_off_p;
+  }
+  if (_is_dram && _is_wl_tr)
+  { // DRAM wordline transistor
+    return pwidth * g_tp.dram_wl.I_off_p;
+  }
+  // DRAM or SRAM all other transistors
+  return pwidth * g_tp.peri_global.I_off_p;
+}
+
+// NMOS gate leakage: nWidth times the I_g_on_n of the selected device
+// (SRAM cell, DRAM wordline, otherwise peripheral/global).
+double cmos_Ig_n(
+    double nWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  const TechnologyParameter::DeviceType * device = &g_tp.peri_global;
+
+  if (!_is_dram && _is_cell)
+  {
+    device = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else if (_is_dram && _is_wl_tr)
+  {
+    device = &g_tp.dram_wl;    // DRAM wordline transistor
+  }
+
+  return nWidth*device->I_g_on_n;
+}
+
+// PMOS gate leakage: pWidth times the I_g_on_p of the selected device
+// (SRAM cell, DRAM wordline, otherwise peripheral/global).
+double cmos_Ig_p(
+    double pWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  const TechnologyParameter::DeviceType * device = &g_tp.peri_global;
+
+  if (!_is_dram && _is_cell)
+  {
+    device = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else if (_is_dram && _is_wl_tr)
+  {
+    device = &g_tp.dram_wl;    // DRAM wordline transistor
+  }
+
+  return pWidth*device->I_g_on_p;
+}
+
+// Subthreshold leakage of a gate, averaged over its 2^fanin input states.
+//   nWidth/pWidth - transistor widths passed to simplified_nmos_leakage()
+//                   and simplified_pmos_leakage()
+//   fanin         - number of inputs, must be >= 1 (asserted)
+//   g_type        - gate kind; unknown values hit assert(0)
+//   topo          - for nmos/pmos with fanin > 1: parallel or series network
+// Series stacks weight each additional off transistor by
+// UNI_LEAK_STACK_FACTOR.
+// NOTE(review): the stack loops end with combination(fanin, fanin); verify
+// that combination() returns 1 for m == n.
+double cmos_Isub_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr,
+    enum Half_net_topology topo)
+{
+        assert (fanin>=1);
+        double nmos_leak = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr);
+        double pmos_leak = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr);
+    double Isub=0;
+    int    num_states;
+    int    num_off_tx;
+
+    // number of distinct input combinations
+    num_states = int(pow(2.0, fanin));
+
+    switch (g_type)
+    {
+    case nmos:
+        if (fanin==1)
+        {
+                Isub = nmos_leak/num_states;
+        }
+        else
+        {
+                if (topo==parallel)
+                {
+                        Isub=nmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
+                }
+                else
+                {
+                        for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
+                        {
+                                //Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+                                Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+                        }
+                        Isub /=num_states;
+                }
+
+        }
+        break;
+    case pmos:
+        if (fanin==1)
+        {
+                Isub = pmos_leak/num_states;
+        }
+        else
+        {
+                if (topo==parallel)
+                {
+                        Isub=pmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
+                }
+                else
+                {
+                        for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
+                        {
+                                //Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+                                Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+                        }
+                        Isub /=num_states;
+                }
+
+        }
+        break;
+    case inv:
+        // one of the two transistors is off in each input state
+        Isub = (nmos_leak + pmos_leak)/2;
+        break;
+    case nand:
+        Isub += fanin*pmos_leak;//the pullup network
+        for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pulldown network
+        {
+                //Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+            Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+        }
+        Isub /=num_states;
+        break;
+    case nor:
+        for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pullup network
+        {
+                //Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+                Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+        }
+        Isub += fanin*nmos_leak;//the pulldown network
+        Isub /=num_states;
+        break;
+    case tri:
+        // average of the enabled and disabled states
+        Isub += (nmos_leak + pmos_leak)/2;//enabled
+        Isub += nmos_leak*UNI_LEAK_STACK_FACTOR; //disabled upper bound of leakage power
+        Isub /=2;
+        break;
+    case tg:
+        Isub = (nmos_leak + pmos_leak)/2;
+        break;
+    default:
+        assert(0);
+        break;
+          }
+
+    return Isub;
+}
+
+
+double cmos_Ig_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr,
+    enum Half_net_topology topo)
+{
+        assert (fanin>=1);
+                double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr);
+                double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr);
+            double Ig_on=0;
+            int    num_states;
+            int    num_on_tx;
+
+            num_states = int(pow(2.0, fanin));
+
+            switch (g_type)
+            {
+            case nmos:
+                if (fanin==1)
+                {
+                        Ig_on = nmos_leak/num_states;
+                }
+                else
+                {
+                        if (topo==parallel)
+                        {
+                        for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
+                        {
+                                Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+                        }
+                        }
+                        else
+                        {
+                                Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
+                            //num_on_tx is the number of on tx
+                                for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+                                {
+                                        Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+                                }
+                                Ig_on /=num_states;
+                        }
+                }
+                break;
+            case pmos:
+                if (fanin==1)
+                {
+                        Ig_on = pmos_leak/num_states;
+                }
+                else
+                {
+                        if (topo==parallel)
+                    {
+                  for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
+                  {
+                          Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+                  }
+                    }
+                    else
+                    {
+                          Ig_on += pmos_leak * fanin;//pull down network when all TXs are on.
+                      //num_on_tx is the number of on tx
+                          for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+                          {
+                                  Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+                          }
+                          Ig_on /=num_states;
+                    }
+                }
+                break;
+
+            case inv:
+                Ig_on = (nmos_leak + pmos_leak)/2;
+                break;
+            case nand:
+                //pull up network
+                for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+                {
+                        Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+                }
+
+                //pull down network
+                Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
+                //num_on_tx is the number of on tx
+                for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+                {
+                        Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+                }
+                Ig_on /=num_states;
+                break;
+            case nor:
+                // num_on_tx is the number of on tx in pull up network
+                Ig_on += pmos_leak * fanin;//pull up network when all TXs are on.
+                for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
+                {
+                        Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
+
+                }
+                //pull down network
+                for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+                {
+                        Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+                }
+                Ig_on /=num_states;
+                break;
+            case tri:
+                Ig_on += (2*nmos_leak + 2*pmos_leak)/2;//enabled
+                Ig_on += (nmos_leak + pmos_leak)/2; //disabled upper bound of leakage power
+                Ig_on /=2;
+                break;
+            case tg:
+                Ig_on = (nmos_leak + pmos_leak)/2;
+                break;
+            default:
+                assert(0);
+                break;
+                  }
+
+            return Ig_on;
+}
+
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+
+        double p_short_circuit, p_short_circuit_discharge, p_short_circuit_charge, p_short_circuit_discharge_low, p_short_circuit_discharge_high, p_short_circuit_charge_low, p_short_circuit_charge_high; //this is actually energy
+        double fo_n, fo_p, fanout, beta_ratio, vt_to_vdd_ratio;
+
+        fo_n   = i_on_n/i_on_n_in;
+        fo_p   = i_on_p/i_on_p_in;
+        fanout = c_out/c_in;
+        beta_ratio = i_on_p/i_on_n;
+        vt_to_vdd_ratio = vt/vdd;
+
+        //p_short_circuit_discharge_low        = 10/3*(pow(0.5-vt_to_vdd_ratio,3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
+        p_short_circuit_discharge_low  = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
+        p_short_circuit_charge_low             = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;
+//     double t1, t2, t3, t4, t5;
+//     t1=pow(((vdd-vt)-vt_to_vdd_ratio),3);
+//     t2=pow(velocity_index,2.0);
+//     t3=pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);
+//     t4=t1/t2/t3;
+//     cout <<t1<<"t1\n"<<t2<<"t2\n"<<t3<<"t3\n"<<t4<<"t4\n"<<fanout<<endl;
+
+        p_short_circuit_discharge_high         = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_p/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
+        p_short_circuit_charge_high    = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_n/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
+
+//     t1=pow(((vdd-vt)-vt_to_vdd_ratio),1.5);
+//     t2=pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
+//     t3=t1/t2;
+//     cout <<t1<<"t1\n"<<t2<<"t2\n"<<t3<<"t3\n"<<t4<<"t4\n"<<fanout<<endl;
+//     p_short_circuit_discharge = 1.0/(1.0/p_short_circuit_discharge_low + 1.0/p_short_circuit_discharge_high);
+//     p_short_circuit_charge = 1/(1/p_short_circuit_charge_low + 1/p_short_circuit_charge_high); //harmmoic mean cannot be applied simple formulas.
+
+        p_short_circuit_discharge = p_short_circuit_discharge_low;
+        p_short_circuit_charge = p_short_circuit_charge_low;
+        p_short_circuit = (p_short_circuit_discharge + p_short_circuit_charge)/2;
+
+  return (p_short_circuit);
+}
+
+// Detailed short-circuit model.
+// NOTE(review): as written this function always returns 0.  The result
+// variable p_short_circuit is initialized to 0 and never assigned; the value
+// computed into p_short_circuit_discharge on the last statement is
+// discarded.  fo_n is computed but unused, fanout is fixed at 1, and c_out,
+// w_nmos and w_pmos are not read.  Confirm whether the model is disabled on
+// purpose before wiring the computed value into the return.
+// NOTE(review): g_v_alpha and h_v_alpha raise (1 - velocity_index) and
+// (1 - 2*velocity_index) to generally non-integer powers; pow() of a
+// negative base with a non-integer exponent is NaN, so these are only
+// finite for small velocity_index.
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+
+        double p_short_circuit=0, p_short_circuit_discharge;//, p_short_circuit_charge, p_short_circuit_discharge_low, p_short_circuit_discharge_high, p_short_circuit_charge_low, p_short_circuit_charge_high; //this is actually energy
+        double fo_n, fo_p, fanout, beta_ratio, vt_to_vdd_ratio;
+        double f_alpha, k_v, e, g_v_alpha, h_v_alpha;
+
+        fo_n           = i_on_n/i_on_n_in;
+        fo_p           = i_on_p/i_on_p_in;
+        fanout         = 1;
+        beta_ratio     = i_on_p/i_on_n;
+        vt_to_vdd_ratio = vt/vdd;
+        e                      =       2.71828;
+        f_alpha                =       1/(velocity_index+2) -velocity_index/(2*(velocity_index+3)) +velocity_index/(velocity_index+4)*(velocity_index/2-1);
+        k_v                    =       0.9/0.8+(vdd-vt)/0.8*log(10*(vdd-vt)/e);
+        g_v_alpha      =       (velocity_index + 1)*pow((1-velocity_index),velocity_index)*pow((1-velocity_index),velocity_index/2)/f_alpha/pow((1-velocity_index-velocity_index),(velocity_index/2+velocity_index+2));
+        h_v_alpha      =   pow(2, velocity_index)*(velocity_index+1)*pow((1-velocity_index),velocity_index)/pow((1-velocity_index-velocity_index),(velocity_index+1));
+
+        // The commented-out block below is the simplified model that
+        // shortcircuit_simple() implements.
+        //p_short_circuit_discharge_low        = 10/3*(pow(0.5-vt_to_vdd_ratio,3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
+//     p_short_circuit_discharge_low   = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
+//     p_short_circuit_charge_low              = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;
+//     double t1, t2, t3, t4, t5;
+//     t1=pow(((vdd-vt)-vt_to_vdd_ratio),3);
+//     t2=pow(velocity_index,2.0);
+//     t3=pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);
+//     t4=t1/t2/t3;
+//
+//     cout <<t1<<"t1\n"<<t2<<"t2\n"<<t3<<"t3\n"<<t4<<"t4\n"<<fanout<<endl;
+//
+//
+//     p_short_circuit_discharge_high  = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_p/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
+//     p_short_circuit_charge_high     = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_n/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
+//
+//     p_short_circuit_discharge = 1.0/(1.0/p_short_circuit_discharge_low + 1.0/p_short_circuit_discharge_high);
+//     p_short_circuit_charge = 1/(1/p_short_circuit_charge_low + 1/p_short_circuit_charge_high);
+//
+//     p_short_circuit = (p_short_circuit_discharge + p_short_circuit_charge)/2;
+//
+//     p_short_circuit = p_short_circuit_discharge;
+
+        // Computed but never returned -- see the NOTE at the top.
+        p_short_circuit_discharge = k_v*vdd*vdd*c_in*fo_p*fo_p/((vdd-vt)*g_v_alpha*fanout*beta_ratio/2/k_v + h_v_alpha*fo_p);
+  return (p_short_circuit);
+}
diff --git a/ext/mcpat/cacti/basic_circuit.h b/ext/mcpat/cacti/basic_circuit.h
new file mode 100644 (file)
index 0000000..aaab6c0
--- /dev/null
@@ -0,0 +1,248 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __BASIC_CIRCUIT_H__
+#define __BASIC_CIRCUIT_H__
+
+#include "cacti_interface.h"
+#include "const.h"
+
+using namespace std;
+
+#define UNI_LEAK_STACK_FACTOR 0.43
+
+// Integer helpers defined in basic_circuit.cc.
+int powers (int base, int n);      // base multiplied by itself n times
+bool is_pow2(int64_t val);         // true for positive powers of two
+uint32_t _log2(uint64_t num);      // floor(log2(num)); exits the process on 0
+int factorial(int n, int m = 1);   // m * (m+1) * ... * n for m <= n; n! by default
+int combination(int n, int m);     // choose-m-of-n count used by the leakage models
+
+// Debug-print hook.  Uncomment the DBG define to make PRINTDW(a) emit `a`.
+// The ';' after the parameter list is part of the replacement text, so
+// PRINTDW(a) expands to "; a;" with DBG and to ";" without it.
+// NOTE(review): both definitions end in a line continuation; the line that
+// follows each '\' (the "a;" line, and the blank line in the #else branch)
+// belongs to the macro and must stay in place.
+//#define DBG
+#ifdef DBG
+    #define PRINTDW(a);\
+    a;
+#else
+    #define PRINTDW(a);\
+
+#endif
+
+
+// Where a wire runs relative to a mat.
+// NOTE(review): meaning inferred from the enumerator names only -- confirm
+// against the wire model that consumes this enum.
+enum Wire_placement {
+    outside_mat,
+    inside_mat,
+    local_wires
+};
+
+
+
+// Kinds of H-tree network.  Presumably one per Htree2 member of Bank
+// (address, data in/out, search in/out) -- confirm in htree2.h.
+enum Htree_type {
+    Add_htree,
+    Data_in_htree,
+    Data_out_htree,
+    Search_in_htree,
+    Search_out_htree,
+};
+
+// Gate kinds understood by cmos_Isub_leakage() and cmos_Ig_leakage(); each
+// enumerator has its own case in those functions' switch statements.
+enum Gate_type {
+    nmos,
+    pmos,
+        inv,
+    nand,
+    nor,
+    tri,
+    tg
+};
+
+// Arrangement of the transistors in an nmos/pmos network with fanin > 1, as
+// read by the leakage functions (which compare topo against parallel).
+enum Half_net_topology {
+    parallel,
+    series
+};
+
+double logtwo (double x);
+
+double gate_C(
+    double width,
+    double wirelength,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double gate_C_pass(
+    double width,
+    double wirelength,
+    bool   _is_dram = false,
+    bool   _is_sram = false,
+    bool   _is_wl_tr = false);
+
+double drain_C_(
+    double width,
+    int nchannel,
+    int stack,
+    int next_arg_thresh_folding_width_or_height_cell,
+    double fold_dimension,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double tr_R_on(
+    double width,
+    int nchannel,
+    int stack,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double R_to_w(
+    double res,
+    int nchannel,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+// Horowitz delay approximation; `rise` is compared against RISE in the
+// definition.
+double horowitz (
+    double inputramptime,
+    double tf,
+    double vs1,
+    double vs2,
+    int rise);
+
+// Returns the n_to_p_eff_curr_drv_ratio of the DRAM wordline device when
+// both flags are set, otherwise of the peripheral/global device.
+double pmos_to_nmos_sz_ratio(
+    bool _is_dram = false,
+    bool _is_wl_tr = false);
+
+// nwidth * I_off_n of the selected device.
+double simplified_nmos_leakage(
+    double nwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+// pwidth * I_off_p of the selected device.
+double simplified_pmos_leakage(
+    double pwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+
+// nWidth * I_off_n + pWidth * I_off_p of the selected device.
+double cmos_Ileak(
+    double nWidth,
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+// nWidth * I_g_on_n of the selected device.
+double cmos_Ig_n(
+    double nWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+
+// pWidth * I_g_on_p of the selected device.
+double cmos_Ig_p(
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+
+
+// Subthreshold leakage of a gate of kind g_type with `fanin` inputs
+// (fanin >= 1 is asserted).  topo is read only for nmos/pmos with fanin > 1.
+double cmos_Isub_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+
+// Gate leakage of a gate of kind g_type with `fanin` inputs (fanin >= 1 is
+// asserted).  topo is read only for nmos/pmos with fanin > 1.
+double cmos_Ig_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+
+// Detailed short-circuit model.
+// NOTE(review): the definition in basic_circuit.cc currently returns 0.
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+
+// Simplified short-circuit model; w_nmos and w_pmos are not read by the
+// definition.
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+//set power point product mask; strictly speaking this is not real point product
+inline void set_pppm(
+        double * pppv,
+        double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+                pppv[0]= a;
+                pppv[1]= b;
+                pppv[2]= c;
+                pppv[3]= d;
+
+}
+
+// Writes a, b, c into sppv[0..2].  The fourth value d is accepted but not
+// stored, and sppv[3] is left untouched.
+// NOTE(review): confirm whether a three-entry mask is intended here.
+inline void set_sppm(
+        double * sppv,
+        double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+    const double mask[3] = { a, b, c };
+    for (int i = 0; i < 3; ++i)
+    {
+        sppv[i] = mask[i];
+    }
+}
+
+#endif
diff --git a/ext/mcpat/cacti/batch_tests b/ext/mcpat/cacti/batch_tests
new file mode 100755 (executable)
index 0000000..45a0389
--- /dev/null
@@ -0,0 +1,41 @@
+rm -rf ./out.csv
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
diff --git a/ext/mcpat/cacti/cache.cfg b/ext/mcpat/cacti/cache.cfg
new file mode 100755 (executable)
index 0000000..03de34a
--- /dev/null
@@ -0,0 +1,175 @@
+# Cache size
+//-size (bytes) 2048
+//-size (bytes) 4096
+//-size (bytes) 32768
+//-size (bytes) 262144
+//-size (bytes) 1048576
+//-size (bytes) 2097152
+//-size (bytes) 4194304
+//-size (bytes) 8388608
+//-size (bytes) 16777216
+//-size (bytes) 33554432
+//-size (bytes) 134217728
+//-size (bytes) 67108864
+-size (bytes) 1073741824
+
+# Line size
+//-block size (bytes) 8
+-block size (bytes) 64
+
+# To model Fully Associative cache, set associativity to zero
+//-associativity 0
+//-associativity 2
+//-associativity 4
+-associativity 8
+//-associativity 16
+
+-read-write port 1
+-exclusive read port 0
+-exclusive write port 0
+-single ended read ports 0
+
+# Multiple banks connected using a bus
+-UCA bank count 1
+-technology (u) 0.022
+//-technology (u) 0.040
+//-technology (u) 0.032
+//-technology (u) 0.090
+
+# following three parameters are meaningful only for main memories
+
+-page size (bits) 8192 
+-burst length 8
+-internal prefetch width 8
+
+# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
+-Data array cell type - "itrs-hp"
+//-Data array cell type - "itrs-lstp"
+//-Data array cell type - "itrs-lop"
+
+# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
+-Data array peripheral type - "itrs-hp"
+//-Data array peripheral type - "itrs-lstp"
+//-Data array peripheral type - "itrs-lop"
+
+# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
+-Tag array cell type - "itrs-hp"
+//-Tag array cell type - "itrs-lstp"
+//-Tag array cell type - "itrs-lop"
+
+# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
+-Tag array peripheral type - "itrs-hp"
+//-Tag array peripheral type - "itrs-lstp"
+//-Tag array peripheral type - "itrs-lop"
+
+# Bus width include data bits and address bits required by the decoder
+//-output/input bus width 16
+-output/input bus width 512
+
+// 300-400 in steps of 10
+-operating temperature (K) 360
+
+# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file) 
+# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
+-cache type "cache"
+//-cache type "ram"
+//-cache type "main memory"
+
+# to model special structure like branch target buffers, directory, etc. 
+# change the tag size parameter
+# if you want cacti to calculate the tagbits, set the tag size to "default"
+-tag size (b) "default"
+//-tag size (b) 22
+
+# fast - data and tag access happen in parallel
+# sequential - data array is accessed after accessing the tag array
+# normal - data array lookup and tag access happen in parallel
+#          final data block is broadcast over the data array h-tree
+#          after getting the signal from the tag array
+//-access mode (normal, sequential, fast) - "fast"
+-access mode (normal, sequential, fast) - "normal"
+//-access mode (normal, sequential, fast) - "sequential"
+
+
+# DESIGN OBJECTIVE for UCA (or banks in NUCA)
+-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
+
+# Percentage deviation from the minimum value 
+# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
+# that compromises at most 10% delay. 
+# NOTE: Try reasonable values for % deviation. Inconsistent deviation
+# percentage values will not produce any valid organizations. For example,
+# 0:0:100:100:100 will try to identify an organization that has both
+# least delay and dynamic power. Since such an organization is not possible, CACTI will
+# throw an error. Refer to the CACTI-6 technical report for more details.
+-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
+
+# Objective for NUCA
+-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
+-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
+
+# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
+# energy-delay or energy-delay sq. product
+# Note: Optimize tag will disable weight or deviate values mentioned above
+# Set it to NONE to let weight and deviate values determine the 
+# appropriate cache configuration
+//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
+-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
+//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
+
+-Cache model (NUCA, UCA)  - "UCA"
+//-Cache model (NUCA, UCA)  - "NUCA"
+
+# In order for CACTI to find the optimal NUCA bank value the following
+# variable should be assigned 0.
+-NUCA bank count 0
+
+# NOTE: for NUCA, the network frequency is set to a default value of
+# 5GHz in time.c. CACTI automatically calculates the maximum possible
+# frequency and downgrades this value if necessary.
+
+# By default CACTI considers both full-swing and low-swing 
+# wires to find an optimal configuration. However, it is possible to 
+# restrict the search space by changing the signalling from "default" to 
+# "fullswing" or "lowswing" type.
+//-Wire signalling (fullswing, lowswing, default) - "Global_10"
+-Wire signalling (fullswing, lowswing, default) - "default"
+//-Wire signalling (fullswing, lowswing, default) - "lowswing"
+
+//-Wire inside mat - "global"
+-Wire inside mat - "semi-global"
+//-Wire outside mat - "global"
+-Wire outside mat - "semi-global"
+
+//-Interconnect projection - "conservative"
+-Interconnect projection - "aggressive"
+
+# Contention in the network (which is a function of core count and cache level) is one of
+# the critical factors used for deciding the optimal bank count value.
+# Core count can be 4, 8, or 16.
+//-Core count 4
+-Core count 8
+//-Core count 16
+-Cache level (L2/L3) - "L3"
+
+-Add ECC - "true"
+
+//-Print level (DETAILED, CONCISE) - "CONCISE"
+-Print level (DETAILED, CONCISE) - "DETAILED"
+
+# for debugging
+//-Print input parameters - "true"
+-Print input parameters - "false"
+# force CACTI to model the cache with the 
+# following Ndbl, Ndwl, Nspd, Ndsam,
+# and Ndcm values
+//-Force cache config - "true"
+-Force cache config - "false"
+-Ndwl 1
+-Ndbl 1
+-Nspd 0
+-Ndcm 1
+-Ndsam1 0
+-Ndsam2 0
+
+
diff --git a/ext/mcpat/cacti/cacti.i b/ext/mcpat/cacti/cacti.i
new file mode 100644 (file)
index 0000000..7964138
--- /dev/null
@@ -0,0 +1,8 @@
+/* SWIG interface file: declares the "cacti" module and wraps everything
+   declared in cacti_interface.h. */
+%module cacti
+%{
+/* Includes the header in the wrapper code */
+#include "cacti_interface.h"
+%}
+
+/* Parse the header file to generate wrappers */
+%include "cacti_interface.h"
\ No newline at end of file
diff --git a/ext/mcpat/cacti/cacti.mk b/ext/mcpat/cacti/cacti.mk
new file mode 100644 (file)
index 0000000..4d6de8d
--- /dev/null
@@ -0,0 +1,51 @@
+# Build rules for the stand-alone cacti binary. Objects and the linked
+# binary are placed in obj_$(TAG)/; no rule in this file creates that
+# directory, so it must exist before the build starts.
+TARGET = cacti
+SHELL = /bin/sh
+# NOTE(review): "depend" is listed as phony but no depend target is defined
+# in this file.
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+
+# NTHREADS may be supplied by the caller; it defaults to 8 and is passed to
+# the compiler as -DNTHREADS in the non-debug configuration.
+ifndef NTHREADS
+  NTHREADS = 8
+endif
+
+
+LIBS = 
+# NOTE(review): despite its name, INCS carries a linker flag (-lm) and is
+# only used on the link line below.
+INCS = -lm
+
+# TAG=dbg selects an unoptimized debug build with a single thread; any other
+# value of TAG selects the optimized build.
+ifeq ($(TAG),dbg)
+  DBG = -Wall 
+  OPT = -ggdb -g -O0 -DNTHREADS=1  -gstabs+
+else
+  DBG = 
+  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
+# Both compilers are forced to produce 32-bit code.
+CXX = g++ -m32
+CC  = gcc -m32
+
+SRCS  = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc basic_circuit.cc parameter.cc \
+               decoder.cc component.cc uca.cc subarray.cc wire.cc htree2.cc \
+               cacti_interface.cc router.cc nuca.cc crossbar.cc arbiter.cc 
+
+OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
+# The PYTHONLIB_* and INCLUDES variables are defined here but not referenced
+# by any rule in this file.
+PYTHONLIB_SRCS = $(patsubst main.cc, ,$(SRCS)) obj_$(TAG)/cacti_wrap.cc
+PYTHONLIB_OBJS = $(patsubst %.cc,%.o,$(PYTHONLIB_SRCS)) 
+INCLUDES       = -I /usr/include/python2.4 -I /usr/lib/python2.4/config
+
+# Default target: build the binary in obj_$(TAG)/ and copy it next to the
+# sources.
+all: obj_$(TAG)/$(TARGET)
+       cp -f obj_$(TAG)/$(TARGET) $(TARGET)
+
+obj_$(TAG)/$(TARGET) : $(OBJS)
+       $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#      $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+obj_$(TAG)/%.o : %.cc
+       $(CXX) $(CXXFLAGS) -c $< -o $@
+
+# NOTE(review): clean removes *.o from the current directory only; the
+# objects produced under obj_$(TAG)/ are not removed by this rule.
+clean:
+       -rm -f *.o _cacti.so cacti.py $(TARGET)
+
+
diff --git a/ext/mcpat/cacti/cacti_interface.cc b/ext/mcpat/cacti/cacti_interface.cc
new file mode 100644 (file)
index 0000000..b6d0d13
--- /dev/null
@@ -0,0 +1,173 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <pthread.h>
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <iostream>
+
+#include "Ucache.h"
+#include "area.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "const.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+bool mem_array::lt(const mem_array * m1, const mem_array * m2)
+{
+  if (m1->Nspd < m2->Nspd) return true;
+  else if (m1->Nspd > m2->Nspd) return false;
+  else if (m1->Ndwl < m2->Ndwl) return true;
+  else if (m1->Ndwl > m2->Ndwl) return false;
+  else if (m1->Ndbl < m2->Ndbl) return true;
+  else if (m1->Ndbl > m2->Ndbl) return false;
+  else if (m1->deg_bl_muxing < m2->deg_bl_muxing) return true;
+  else if (m1->deg_bl_muxing > m2->deg_bl_muxing) return false;
+  else if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) return true;
+  else if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) return false;
+  else if (m1->Ndsam_lev_2 < m2->Ndsam_lev_2) return true;
+  else return false;
+}
+
+
+
+void uca_org_t::find_delay()
+{
+  mem_array * data_arr = data_array2;
+  mem_array * tag_arr  = tag_array2;
+
+  // check whether it is a regular cache or scratch ram
+  if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
+  {
+    access_time = data_arr->access_time;
+  }
+  // Both tag and data lookup happen in parallel
+  // and the entire set is sent over the data array h-tree without
+  // waiting for the way-select signal --TODO add the corresponding
+  // power overhead Nav
+  else if (g_ip->fast_access == true)
+  {
+    access_time = MAX(tag_arr->access_time, data_arr->access_time);
+  }
+  // Tag is accessed first. On a hit, way-select signal along with the
+  // address is sent to read/write the appropriate block in the data
+  // array
+  else if (g_ip->is_seq_acc == true)
+  {
+    access_time = tag_arr->access_time + data_arr->access_time;
+  }
+  // Normal access: tag array access and data array access happen in parallel.
+  // But, the data array will wait for the way-select and transfer only the
+  // appropriate block over the h-tree.
+  else
+  {
+    access_time = MAX(tag_arr->access_time + data_arr->delay_senseamp_mux_decoder,
+                      data_arr->delay_before_subarray_output_driver) +
+                  data_arr->delay_from_subarray_output_driver_to_output;
+  }
+}
+
+
+
+// Compute the total power: the data array alone for a pure RAM, pure CAM or
+// fully-associative structure, data array plus tag array otherwise.
+void uca_org_t::find_energy()
+{
+  const bool data_array_only =
+      g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+
+  if (data_array_only)
+  {
+    power = data_array2->power;
+  }
+  else
+  {
+    power = data_array2->power + tag_array2->power;
+  }
+}
+
+
+
+// Compute cache_ht, cache_len and area. With a tag array present, the two
+// arrays are placed side by side: the height is the taller of the two and
+// the length is the sum of both widths. Otherwise the data array dimensions
+// are used directly.
+void uca_org_t::find_area()
+{
+  const bool data_array_only =
+      g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+
+  if (!data_array_only)
+  {
+    cache_ht  = MAX(tag_array2->height, data_array2->height);
+    cache_len = tag_array2->width + data_array2->width;
+  }
+  else
+  {
+    cache_ht  = data_array2->height;
+    cache_len = data_array2->width;
+  }
+
+  area = cache_ht * cache_len;
+}
+
+// Rescale cache_ht / cache_len for pure RAM, pure CAM or fully-associative
+// structures whose data array area efficiency is below 20%, then recompute
+// area. Both dimensions are divided by sqrt(0.2 / efficiency), which scales
+// the area by efficiency / 0.2. In all other cases the dimensions are left
+// untouched and area is simply recomputed from them.
+void uca_org_t::adjust_area()
+{
+  const bool data_array_only =
+      g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+
+  if (data_array_only && data_array2->area_efficiency/100.0<0.2)
+  {
+    const double area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));
+    cache_ht  /= area_adjust;
+    cache_len /= area_adjust;
+  }
+
+  area = cache_ht * cache_len;
+}
+
+// Compute cycle_time: the data array cycle time for a pure RAM, pure CAM or
+// fully-associative structure, otherwise the larger of the tag and data
+// array cycle times.
+void uca_org_t::find_cyc()
+{
+  const bool data_array_only =
+      g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+
+  if (!data_array_only)
+  {
+    cycle_time = MAX(tag_array2->cycle_time,
+                     data_array2->cycle_time);
+    return;
+  }
+
+  cycle_time = data_array2->cycle_time;
+}
+
+// Default constructor: no tag/data array is attached yet, every scalar result
+// starts at zero, the result is marked not valid and the file name is empty,
+// so a freshly constructed object never exposes indeterminate values.
+// The initializers follow the member declaration order of the class.
+// tag_array and data_array (results_mem_array) are default-initialized.
+uca_org_t :: uca_org_t()
+:tag_array2(0),
+ data_array2(0),
+ access_time(0),
+ cycle_time(0),
+ area(0),
+ area_efficiency(0),
+ power(),
+ leak_power_with_sleep_transistors_in_mats(0),
+ cache_ht(0),
+ cache_len(0),
+ vdd_periph_global(0),
+ valid(false)
+{
+  file_n[0] = '\0';  // empty C string until a name is stored
+}
+
+// Delete the mem_array objects referenced by data_array2 and tag_array2 and
+// reset both pointers to null, so that calling cleanup() again on the same
+// object is harmless. Deleting a null pointer is a no-op, hence no explicit
+// null checks.
+// Note: uca_org_t declares no copy constructor, so objects copied by value
+// share these pointers; cleanup() must be called on only one of the copies.
+void uca_org_t :: cleanup()
+{
+  delete data_array2;
+  data_array2 = 0;
+
+  delete tag_array2;
+  tag_array2 = 0;
+}
diff --git a/ext/mcpat/cacti/cacti_interface.h b/ext/mcpat/cacti/cacti_interface.h
new file mode 100644 (file)
index 0000000..f375965
--- /dev/null
@@ -0,0 +1,633 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __CACTI_INTERFACE_H__
+#define __CACTI_INTERFACE_H__
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "const.h"
+
+using namespace std;
+
+
+class min_values_t;
+class mem_array;
+class uca_org_t;
+
+
+class powerComponents
+{
+  public:
+    double dynamic;
+    double leakage;
+    double gate_leakage;
+    double short_circuit;
+    double longer_channel_leakage;
+
+    powerComponents() : dynamic(0), leakage(0), gate_leakage(0), short_circuit(0), longer_channel_leakage(0)  { }
+    powerComponents(const powerComponents & obj) { *this = obj; }
+    powerComponents & operator=(const powerComponents & rhs)
+    {
+      dynamic = rhs.dynamic;
+      leakage = rhs.leakage;
+      gate_leakage  = rhs.gate_leakage;
+      short_circuit = rhs.short_circuit;
+      longer_channel_leakage = rhs.longer_channel_leakage;
+      return *this;
+    }
+    void reset() { dynamic = 0; leakage = 0; gate_leakage = 0; short_circuit = 0;longer_channel_leakage = 0;}
+
+    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
+    friend powerComponents operator*(const powerComponents & x, double const * const y);
+};
+
+
+
+// Power figures for the three operation kinds: read, write and search. Each
+// one is a powerComponents, which starts at zero when default-constructed.
+class powerDef
+{
+  public:
+    powerComponents readOp;
+    powerComponents writeOp;
+    powerComponents searchOp;  // Sheng: for CAM and FA
+
+    // The members' own default constructors zero every component.
+    powerDef() { }
+
+    // Zero every component of all three operations.
+    void reset()
+    {
+      readOp.reset();
+      writeOp.reset();
+      searchOp.reset();
+    }
+
+    // Sum and scaling operators; both are defined outside this class.
+    friend powerDef operator+(const powerDef & x, const powerDef & y);
+    friend powerDef operator*(const powerDef & x, double const * const y);
+};
+
+// Kinds of wire that can be selected. Invalid_wtype is the last enumerator
+// and does not name a real wire type.
+enum Wire_type
+{
+    Global /* global wires with repeaters */,
+    Global_5 /* 5% delay penalty */,
+    Global_10 /* 10% delay penalty */,
+    Global_20 /* 20% delay penalty */,
+    Global_30 /* 30% delay penalty */,
+    Low_swing /* differential low power wires with high area overhead */,
+    Semi_global /* mid-level wires with repeaters*/,
+    Transmission /* transmission lines with high area overhead */,
+    Optical /* optical wires */,
+    Invalid_wtype
+};
+
+
+
+// Collection of all input parameters describing the memory structure to be
+// modelled and the optimization settings. Every data member is public and
+// the class declares no constructor, so members hold whatever the caller (or
+// parse_cfg) stores in them.
+class InputParameter
+{
+  public:
+    // Fill the parameters from the configuration file named by infile.
+    void parse_cfg(const string & infile);
+
+    bool error_checking();  // return false if the input parameters are problematic
+    void display_ip();
+
+    // Basic geometry of the structure.
+    unsigned int cache_sz;  // in bytes
+    unsigned int line_sz;
+    unsigned int assoc;
+    unsigned int nbanks;
+    unsigned int out_w;// == nr_bits_out
+    bool     specific_tag;
+    unsigned int tag_w;
+    unsigned int access_mode;
+    unsigned int obj_func_dyn_energy;
+    unsigned int obj_func_dyn_power;
+    unsigned int obj_func_leak_power;
+    unsigned int obj_func_cycle_t;
+
+    double   F_sz_nm;          // feature size in nm
+    double   F_sz_um;          // feature size in um
+    // Port counts.
+    unsigned int num_rw_ports;
+    unsigned int num_rd_ports;
+    unsigned int num_wr_ports;
+    unsigned int num_se_rd_ports;  // number of single ended read ports
+    unsigned int num_search_ports;  // Sheng: number of search ports for CAM
+    // Kind of memory being modelled.
+    bool     is_main_mem;
+    bool     is_cache;
+    bool     pure_ram;
+    bool     pure_cam;
+    bool     rpters_in_htree;  // if there are repeaters in htree segment
+    unsigned int ver_htree_wires_over_array;
+    unsigned int broadcast_addr_din_over_ver_htrees;
+    // NOTE(review): presumably the operating temperature -- confirm units
+    // against parse_cfg.
+    unsigned int temp;
+
+    // Technology type selectors for cells and peripheral/global circuitry,
+    // overall and separately for the data and tag arrays.
+    unsigned int ram_cell_tech_type;
+    unsigned int peri_global_tech_type;
+    unsigned int data_arr_ram_cell_tech_type;
+    unsigned int data_arr_peri_global_tech_type;
+    unsigned int tag_arr_ram_cell_tech_type;
+    unsigned int tag_arr_peri_global_tech_type;
+
+    unsigned int burst_len;
+    unsigned int int_prefetch_w;
+    unsigned int page_sz_bits;
+
+    unsigned int ic_proj_type;      // interconnect_projection_type
+    unsigned int wire_is_mat_type;  // wire_inside_mat_type
+    unsigned int wire_os_mat_type; // wire_outside_mat_type
+    enum Wire_type wt;
+    int force_wiretype;
+    bool print_input_args;
+    unsigned int nuca_cache_sz; // TODO
+    // NOTE(review): presumably the partitioning values applied when
+    // force_cache_config is set -- confirm against the code that reads them.
+    int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
+    bool force_cache_config;
+
+    int cache_level;
+    int cores;
+    int nuca_bank_count;
+    int force_nuca_bank;
+
+    // Weights (_wt) and allowed deviations (_dev) for delay, dynamic power,
+    // leakage power, cycle time and area; the _nuca variants are kept
+    // separately.
+    int delay_wt, dynamic_power_wt, leakage_power_wt,
+        cycle_time_wt, area_wt;
+    int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
+        cycle_time_wt_nuca, area_wt_nuca;
+
+    int delay_dev, dynamic_power_dev, leakage_power_dev,
+        cycle_time_dev, area_dev;
+    int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
+        cycle_time_dev_nuca, area_dev_nuca;
+    int ed; //ED or ED2 optimization
+    int nuca;
+
+    bool     fast_access;
+    unsigned int block_sz;  // bytes
+    unsigned int tag_assoc;
+    unsigned int data_assoc;
+    bool     is_seq_acc;
+    bool     fully_assoc;
+    unsigned int nsets;  // == number_of_sets
+    int print_detail;
+
+
+    bool     add_ecc_b_;
+  //parameters for design constraint
+  double throughput;
+  double latency;
+  bool pipelinable;
+  int pipeline_stages;
+  int per_stage_vector;
+  bool with_clock_grid;
+};
+
+
+// Flat record of the results for one memory array: organization parameters,
+// a breakdown of delays, a breakdown of power (as powerDef values),
+// dimensions, and a set of refresh/energy figures. uca_org_t holds one of
+// these for the tag array and one for the data array.
+// NOTE(review): units are not stated anywhere in this declaration.
+typedef struct{
+  // Organization of the array.
+  int Ndwl;
+  int Ndbl;
+  double Nspd;
+  int deg_bl_muxing;
+  int Ndsam_lev_1;
+  int Ndsam_lev_2;
+  int number_activated_mats_horizontal_direction;
+  int number_subbanks;
+  int page_size_in_bits;
+  // Delay breakdown and overall timing.
+  double delay_route_to_bank;
+  double delay_crossbar;
+  double delay_addr_din_horizontal_htree;
+  double delay_addr_din_vertical_htree;
+  double delay_row_predecode_driver_and_block;
+  double delay_row_decoder;
+  double delay_bitlines;
+  double delay_sense_amp;
+  double delay_subarray_output_driver;
+  double delay_bit_mux_predecode_driver_and_block;
+  double delay_bit_mux_decoder;
+  double delay_senseamp_mux_lev_1_predecode_driver_and_block;
+  double delay_senseamp_mux_lev_1_decoder;
+  double delay_senseamp_mux_lev_2_predecode_driver_and_block;
+  double delay_senseamp_mux_lev_2_decoder;
+  double delay_input_htree;
+  double delay_output_htree;
+  double delay_dout_vertical_htree;
+  double delay_dout_horizontal_htree;
+  double delay_comparator;
+  double access_time;
+  double cycle_time;
+  double multisubbank_interleave_cycle_time;
+  double delay_request_network;
+  double delay_inside_mat;
+  double delay_reply_network;
+  double trcd;
+  double cas_latency;
+  double precharge_delay;
+  // Power breakdown per component, plus the total.
+  powerDef power_routing_to_bank;
+  powerDef power_addr_input_htree;
+  powerDef power_data_input_htree;
+  powerDef power_data_output_htree;
+  powerDef power_addr_horizontal_htree;
+  powerDef power_datain_horizontal_htree;
+  powerDef power_dataout_horizontal_htree;
+  powerDef power_addr_vertical_htree;
+  powerDef power_datain_vertical_htree;
+  powerDef power_row_predecoder_drivers;
+  powerDef power_row_predecoder_blocks;
+  powerDef power_row_decoders;
+  powerDef power_bit_mux_predecoder_drivers;
+  powerDef power_bit_mux_predecoder_blocks;
+  powerDef power_bit_mux_decoders;
+  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_1_decoders;
+  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_2_decoders;
+  powerDef power_bitlines;
+  powerDef power_sense_amps;
+  powerDef power_prechg_eq_drivers;
+  powerDef power_output_drivers_at_subarray;
+  powerDef power_dataout_vertical_htree;
+  powerDef power_comparators;
+  powerDef power_crossbar;
+  powerDef total_power;
+  // Area and dimensions.
+  double area;
+  double all_banks_height;
+  double all_banks_width;
+  double bank_height;
+  double bank_width;
+  double subarray_memory_cell_area_height;
+  double subarray_memory_cell_area_width;
+  double mat_height;
+  double mat_width;
+  double routing_area_height_within_bank;
+  double routing_area_width_within_bank;
+  double area_efficiency;
+  // The percentage-breakdown fields below are disabled (commented out).
+//  double perc_power_dyn_routing_to_bank;
+//  double perc_power_dyn_addr_horizontal_htree;
+//  double perc_power_dyn_datain_horizontal_htree;
+//  double perc_power_dyn_dataout_horizontal_htree;
+//  double perc_power_dyn_addr_vertical_htree;
+//  double perc_power_dyn_datain_vertical_htree;
+//  double perc_power_dyn_row_predecoder_drivers;
+//  double perc_power_dyn_row_predecoder_blocks;
+//  double perc_power_dyn_row_decoders;
+//  double perc_power_dyn_bit_mux_predecoder_drivers;
+//  double perc_power_dyn_bit_mux_predecoder_blocks;
+//  double perc_power_dyn_bit_mux_decoders;
+//  double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers;
+//  double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks;
+//  double perc_power_dyn_senseamp_mux_lev_1_decoders;
+//  double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers;
+//  double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks;
+//  double perc_power_dyn_senseamp_mux_lev_2_decoders;
+//  double perc_power_dyn_bitlines;
+//  double perc_power_dyn_sense_amps;
+//  double perc_power_dyn_prechg_eq_drivers;
+//  double perc_power_dyn_subarray_output_drivers;
+//  double perc_power_dyn_dataout_vertical_htree;
+//  double perc_power_dyn_comparators;
+//  double perc_power_dyn_crossbar;
+//  double perc_power_dyn_spent_outside_mats;
+//  double perc_power_leak_routing_to_bank;
+//  double perc_power_leak_addr_horizontal_htree;
+//  double perc_power_leak_datain_horizontal_htree;
+//  double perc_power_leak_dataout_horizontal_htree;
+//  double perc_power_leak_addr_vertical_htree;
+//  double perc_power_leak_datain_vertical_htree;
+//  double perc_power_leak_row_predecoder_drivers;
+//  double perc_power_leak_row_predecoder_blocks;
+//  double perc_power_leak_row_decoders;
+//  double perc_power_leak_bit_mux_predecoder_drivers;
+//  double perc_power_leak_bit_mux_predecoder_blocks;
+//  double perc_power_leak_bit_mux_decoders;
+//  double perc_power_leak_senseamp_mux_lev_1_predecoder_drivers;
+//  double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks;
+//  double perc_power_leak_senseamp_mux_lev_1_decoders;
+//  double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers;
+//  double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks;
+//  double perc_power_leak_senseamp_mux_lev_2_decoders;
+//  double perc_power_leak_bitlines;
+//  double perc_power_leak_sense_amps;
+//  double perc_power_leak_prechg_eq_drivers;
+//  double perc_power_leak_subarray_output_drivers;
+//  double perc_power_leak_dataout_vertical_htree;
+//  double perc_power_leak_comparators;
+//  double perc_power_leak_crossbar;
+//  double perc_leak_mats;
+//  double perc_active_mats;
+  // Refresh, page-mode and per-operation energy figures.
+  double refresh_power;
+  double dram_refresh_period;
+  double dram_array_availability;
+  double dyn_read_energy_from_closed_page;
+  double dyn_read_energy_from_open_page;
+  double leak_power_subbank_closed_page;
+  double leak_power_subbank_open_page;
+  double leak_power_request_and_reply_networks;
+  double activate_energy;
+  double read_energy;
+  double write_energy;
+  double precharge_energy;
+} results_mem_array;
+
+
+// Result of modelling one memory structure: pointers to the tag and data
+// mem_array objects plus the overall figures derived from them.
+// The destructor is empty and no copy constructor is declared, so the two
+// mem_array objects are released only by an explicit call to cleanup(), and
+// copies of this object share the same pointers.
+class uca_org_t
+{
+  public:
+    mem_array * tag_array2;
+    mem_array * data_array2;
+    double access_time;
+    double cycle_time;
+    double area;
+    double area_efficiency;
+    powerDef power;
+    double leak_power_with_sleep_transistors_in_mats;
+    double cache_ht;
+    double cache_len;
+    char file_n[100];
+    double vdd_periph_global;
+    bool valid;
+    results_mem_array tag_array;
+    results_mem_array data_array;
+
+    uca_org_t();
+    void find_delay();   // sets access_time
+    void find_energy();  // sets power
+    void find_area();    // sets cache_ht, cache_len and area
+    void find_cyc();     // sets cycle_time
+    void adjust_area();//for McPAT only to adjust routing overhead
+    void cleanup();      // deletes tag_array2 and data_array2
+    ~uca_org_t(){};
+};
+
+void reconfigure(InputParameter *local_interface, uca_org_t *fin_res);
+
+uca_org_t cacti_interface(const string & infile_name);  // overload selected by an input file name
+//McPAT's plain interface, please keep !!!
+uca_org_t cacti_interface(InputParameter * const local_interface);
+//McPAT's plain interface, please keep !!!
+uca_org_t init_interface(InputParameter * const local_interface);
+//McPAT's plain interface, please keep !!!
+uca_org_t cacti_interface(  // every argument is positional and all but tech_node are int, so a swapped pair compiles silently
+            int cache_size,
+            int line_size,
+            int associativity,
+            int rw_ports,
+            int excl_read_ports,
+            int excl_write_ports,
+            int single_ended_read_ports,
+            int search_ports,
+            int banks,
+            double tech_node,
+            int output_width,
+            int specific_tag,
+            int tag_width,
+            int access_mode,
+            int cache,
+            int main_mem,
+            int obj_func_delay,
+            int obj_func_dynamic_power,
+            int obj_func_leakage_power,
+            int obj_func_cycle_time,  // NOTE: cycle_time comes before area here; the "Naveen's interface" overload declares area first
+            int obj_func_area,
+            int dev_func_delay,
+            int dev_func_dynamic_power,
+            int dev_func_leakage_power,
+            int dev_func_area,
+            int dev_func_cycle_time,
+            int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+            int temp,
+            int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+            int data_arr_ram_cell_tech_flavor_in,
+            int data_arr_peri_global_tech_flavor_in,
+            int tag_arr_ram_cell_tech_flavor_in,
+            int tag_arr_peri_global_tech_flavor_in,
+            int interconnect_projection_type_in,
+            int wire_inside_mat_type_in,
+            int wire_outside_mat_type_in,
+            int REPEATERS_IN_HTREE_SEGMENTS_in,
+            int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+            int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+            int PAGE_SIZE_BITS_in,
+            int BURST_LENGTH_in,
+            int INTERNAL_PREFETCH_WIDTH_in,
+            int force_wiretype,
+            int wiretype,
+            int force_config,
+            int ndwl,
+            int ndbl,
+            int nspd,
+            int ndcm,
+            int ndsam1,
+            int ndsam2,
+            int ecc);
+//    int cache_size,
+//    int line_size,
+//    int associativity,
+//    int rw_ports,
+//    int excl_read_ports,
+//    int excl_write_ports,
+//    int single_ended_read_ports,
+//    int banks,
+//    double tech_node,
+//    int output_width,
+//    int specific_tag,
+//    int tag_width,
+//    int access_mode,
+//    int cache,
+//    int main_mem,
+//    int obj_func_delay,
+//    int obj_func_dynamic_power,
+//    int obj_func_leakage_power,
+//    int obj_func_area,
+//    int obj_func_cycle_time,
+//    int dev_func_delay,
+//    int dev_func_dynamic_power,
+//    int dev_func_leakage_power,
+//    int dev_func_area,
+//    int dev_func_cycle_time,
+//    int temp,
+//    int data_arr_ram_cell_tech_flavor_in,
+//    int data_arr_peri_global_tech_flavor_in,
+//    int tag_arr_ram_cell_tech_flavor_in,
+//    int tag_arr_peri_global_tech_flavor_in,
+//    int interconnect_projection_type_in,
+//    int wire_inside_mat_type_in,
+//    int wire_outside_mat_type_in,
+//    int REPEATERS_IN_HTREE_SEGMENTS_in,
+//    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+//    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+////    double MAXAREACONSTRAINT_PERC_in,
+////    double MAXACCTIMECONSTRAINT_PERC_in,
+////    double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in,
+//    int PAGE_SIZE_BITS_in,
+//    int BURST_LENGTH_in,
+//    int INTERNAL_PREFETCH_WIDTH_in);
+
+//Naveen's interface
+uca_org_t cacti_interface(  // every argument is positional and all but tech_node are int, so a swapped pair compiles silently
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int banks,
+    double tech_node,
+    int page_sz,
+    int burst_length,
+    int pre_width,
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode, //0 normal, 1 seq, 2 fast
+    int cache, //scratch ram or cache
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_area,  // NOTE: area comes before cycle_time here; the McPAT overload above declares cycle_time first
+    int obj_func_cycle_time,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area,
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in,
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
+    int wire_inside_mat_type_in,
+    int wire_outside_mat_type_in,
+    int is_nuca, // 0 - UCA, 1 - NUCA
+    int core_count,
+    int cache_level, // 0 - L2, 1 - L3
+    int nuca_bank_count,
+    int nuca_obj_func_delay,
+    int nuca_obj_func_dynamic_power,
+    int nuca_obj_func_leakage_power,
+    int nuca_obj_func_area,
+    int nuca_obj_func_cycle_time,
+    int nuca_dev_func_delay,
+    int nuca_dev_func_dynamic_power,
+    int nuca_dev_func_leakage_power,
+    int nuca_dev_func_area,
+    int nuca_dev_func_cycle_time,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
+    int p_input);
+
+class mem_array  // per-array record; uca_org_t points at two of these (tag_array2 and data_array2)
+{
+  public:
+  int    Ndcm;
+  int    Ndwl;
+  int    Ndbl;
+  double Nspd;  // the only N parameter stored as double; the others are int
+  int    deg_bl_muxing;
+  int    Ndsam_lev_1;
+  int    Ndsam_lev_2;
+  double access_time;
+  double cycle_time;
+  double multisubbank_interleave_cycle_time;
+  double area_ram_cells;
+  double area;
+  powerDef power;
+  double delay_senseamp_mux_decoder;
+  double delay_before_subarray_output_driver;
+  double delay_from_subarray_output_driver_to_output;
+  double height;
+  double width;
+
+  double mat_height;
+  double mat_length;
+  double subarray_length;
+  double subarray_height;
+
+  double delay_route_to_bank,  // one delay term per named stage of the access path
+         delay_input_htree,
+         delay_row_predecode_driver_and_block,
+         delay_row_decoder,
+         delay_bitlines,
+         delay_sense_amp,
+         delay_subarray_output_driver,
+         delay_dout_htree,
+         delay_comparator,
+         delay_matchlines;
+
+  double all_banks_height,
+         all_banks_width,
+         area_efficiency;
+
+  powerDef power_routing_to_bank;  // one powerDef per named sub-circuit from here down to power_comparators
+  powerDef power_addr_input_htree;
+  powerDef power_data_input_htree;
+  powerDef power_data_output_htree;
+  powerDef power_htree_in_search;
+  powerDef power_htree_out_search;
+  powerDef power_row_predecoder_drivers;
+  powerDef power_row_predecoder_blocks;
+  powerDef power_row_decoders;
+  powerDef power_bit_mux_predecoder_drivers;
+  powerDef power_bit_mux_predecoder_blocks;
+  powerDef power_bit_mux_decoders;
+  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_1_decoders;
+  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_2_decoders;
+  powerDef power_bitlines;
+  powerDef power_sense_amps;
+  powerDef power_prechg_eq_drivers;
+  powerDef power_output_drivers_at_subarray;
+  powerDef power_dataout_vertical_htree;
+  powerDef power_comparators;
+
+  powerDef power_cam_bitline_precharge_eq_drv;  // NOTE(review): this group looks CAM/search specific (searchline, matchline) - confirm
+  powerDef power_searchline;
+  powerDef power_searchline_precharge;
+  powerDef power_matchlines;
+  powerDef power_matchline_precharge;
+  powerDef power_matchline_to_wordline_drv;
+
+  min_values_t *arr_min;  // raw pointer; NOTE(review): ownership is not visible here - confirm who frees it
+  enum Wire_type wt;
+
+  // dram stats
+  double activate_energy, read_energy, write_energy, precharge_energy,
+  refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page,
+  leak_power_request_and_reply_networks;
+
+  double precharge_delay;
+
+  static bool lt(const mem_array * m1, const mem_array * m2);  // two-pointer predicate; the ordering criterion lives in the definition, not shown here
+};
+
+
+#endif
diff --git a/ext/mcpat/cacti/component.cc b/ext/mcpat/cacti/component.cc
new file mode 100644 (file)
index 0000000..7331084
--- /dev/null
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "bank.h"
+#include "component.h"
+#include "decoder.h"
+
+using namespace std;
+
+
+
+// Value-initialises area, power and rt_power; delay and cycle_time both start at 0
+// so that neither scalar can be read while still uninitialised.
+Component::Component()
+  : area(), power(), rt_power(), delay(0), cycle_time(0) {}
+
+
+
+// Empty body: the data members are destroyed implicitly.
+Component::~Component()
+{}
+
+
+
+// Total diffusion width for num_stacked_in stacked gates laid out as num_folded_tr folded legs.
+double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr)
+{
+  const double gate_w     = g_ip->F_sz_um;  // each poly gate is one feature size wide
+  const double contact_sp = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
+
+  // First leg: contacted regions for both source and drain, plus the stacked gates and the gaps between them.
+  const double first_leg = 2 * contact_sp + num_stacked_in * gate_w +
+                           (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;
+  if (num_folded_tr <= 1)
+    return first_leg;
+
+  // Remaining legs: (folds - 1) further gate stacks and (folds - 2) further pairs of contacted regions.
+  return first_leg + ((num_folded_tr - 2) * 2 * contact_sp +
+                      (num_folded_tr - 1) * num_stacked_in * gate_w +
+                      (num_folded_tr - 1) * (num_stacked_in - 1) * g_tp.spacing_poly_to_poly);
+}
+
+
+
+// Layout area of an INV, NOR or NAND gate whose devices are folded to fit the cell height.
+//   gate_type  - INV, NOR or NAND; any other value prints a message and exits the program
+//   num_inputs - gate fan-in; ignored for INV
+//   w_pmos     - PMOS width; a non-positive value makes the result 0
+//   w_nmos     - NMOS width; a non-positive value makes the result 0
+//   h_gate     - height budget for the cell, power rails included
+// Returns Area::get_area() of the sized cell.
+double Component::compute_gate_area(
+    int gate_type,
+    int num_inputs,
+    double w_pmos,
+    double w_nmos,
+    double h_gate)
+{
+  if (w_pmos <= 0.0 || w_nmos <= 0.0)
+    return 0.0;
+
+  Area cell;
+  const double diff_region_h = h_gate - 2 * g_tp.HPOWERRAIL;  // height left once both power rails are removed
+  const double p_share       = w_pmos / (w_pmos + w_nmos);    // PMOS fraction of the combined device width
+  if (p_share >= 1 || p_share <= 0)
+    return 0.0;
+
+  // Split the usable height (minus the gap between P and N diffusions) between PMOS and NMOS
+  // in that proportion; each share is the widest a single folded leg of that type may be.
+  const double usable_h  = diff_region_h - g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
+  const double max_leg_p = usable_h * p_share;
+  const double max_leg_n = usable_h * (1 - p_share);
+  assert(max_leg_p > 0);
+
+  const int legs_p = (int) (ceil(w_pmos / max_leg_p));
+  const int legs_n = (int) (ceil(w_nmos / max_leg_n));
+
+  // (stack depth, fold count) handed to compute_diffusion_width for each diffusion type.
+  int n_stack, n_folds, p_stack, p_folds;
+  switch (gate_type)
+  {
+    case INV:
+      n_stack = 1;          n_folds = legs_n;
+      p_stack = 1;          p_folds = legs_p;
+      break;
+
+    case NOR:   // NMOS side by side, PMOS stacked
+      n_stack = 1;          n_folds = num_inputs * legs_n;
+      p_stack = num_inputs; p_folds = legs_p;
+      break;
+
+    case NAND:  // NMOS stacked, PMOS side by side
+      n_stack = num_inputs; n_folds = legs_n;
+      p_stack = 1;          p_folds = num_inputs * legs_p;
+      break;
+    default:
+      cout << "Unknown gate type: " << gate_type << endl;
+      exit(1);
+  }
+
+  const double n_diff_w = compute_diffusion_width(n_stack, n_folds);
+  const double p_diff_w = compute_diffusion_width(p_stack, p_folds);
+  cell.w = (n_diff_w > p_diff_w) ? n_diff_w : p_diff_w;
+
+  // If a single NMOS leg has room to spare, the gate can be shorter than the height requested,
+  // so compute the height it actually needs; otherwise keep h_gate.
+  cell.h = (max_leg_n > w_nmos)
+         ? w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL
+         : h_gate;
+  return cell.get_area();
+}
+
+
+
+// Width of the cell (not of a device - the two are orthogonal) that holds a transistor of
+// width input_width once it is folded into legs no wider than threshold_folding_width.
+// Returns 0 for a non-positive input_width.
+double Component::compute_tr_width_after_folding(
+    double input_width,
+    double threshold_folding_width)
+{
+  if (input_width <= 0)
+    return 0;
+
+  const int    legs       = (int) (ceil(input_width / threshold_folding_width));
+  const double gate_w     = g_ip->F_sz_um;
+  const double contact_sp = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
+
+  // One poly gate per leg, with a contacted region before, between and after the gates.
+  return legs * gate_w + (legs + 1) * contact_sp;
+}
+
+
+
+// Height of a sense amplifier whose devices are folded to fit a pitch of pitch_sense_amp.
+double Component::height_sense_amplifier(double pitch_sense_amp)
+{
+  const double same_type_gaps = 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;
+
+  // PMOS stack: two w_sense_p devices and one w_iso device, plus the same-type diffusion gaps.
+  const double pmos_h = compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp) * 2 +
+                        compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp) +
+                        same_type_gaps;
+  // NMOS stack: two w_sense_n devices and one w_sense_en device, plus the same-type diffusion gaps.
+  const double nmos_h = compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp) * 2 +
+                        compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp) +
+                        same_type_gaps;
+  return pmos_h + nmos_h + g_tp.MIN_GAP_BET_P_AND_N_DIFFS;  // the stacks are separated by the P-to-N diffusion gap
+}
+
+
+
+// Picks the stage count N of a gate chain (about log(F)/log(fopt), made even, at least num_gates_min), writes
+// w_n/w_p at N-1 and then N-2 down to 1, and returns N. w_n[0]/w_p[0] are read if the last stage is clamped.
+int Component::logical_effort(
+    int num_gates_min,
+    double g,
+    double F,
+    double * w_n,
+    double * w_p,
+    double C_load,
+    double p_to_n_sz_ratio,
+    bool   is_dram_,
+    bool   is_wl_tr_,
+    double max_w_nmos)
+{
+  int num_gates = (int) (log(F) / log(fopt));
+  // check if num_gates is odd. if so, add 1 to make it even
+  num_gates+= (num_gates % 2) ? 1 : 0;
+  num_gates = MAX(num_gates, num_gates_min);
+  assert(num_gates <= MAX_NUMBER_GATES_STAGE);  // must hold before w_n/w_p are indexed below
+
+  // recalculate the effective fanout of each stage
+  double f = pow(F, 1.0 / num_gates);
+  int    i = num_gates - 1;
+  double C_in = C_load / f;
+  w_n[i]  = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_);
+  w_n[i]  = MAX(w_n[i], g_tp.min_w_nmos_);
+  w_p[i]  = p_to_n_sz_ratio * w_n[i];
+
+  if (w_n[i] > max_w_nmos)  // last stage too wide: clamp it, then recompute F and the stage count from stage 0
+  {
+    double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_);
+    F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_);
+    num_gates = (int) (log(F) / log(fopt)) + 1;
+    num_gates+= (num_gates % 2) ? 1 : 0;
+    num_gates = MAX(num_gates, num_gates_min);
+    assert(num_gates <= MAX_NUMBER_GATES_STAGE);  // re-check: the stage count was just recomputed
+    f = pow(F, 1.0 / (num_gates - 1));
+    i = num_gates - 1;
+    w_n[i]  = max_w_nmos;
+    w_p[i]  = p_to_n_sz_ratio * w_n[i];
+  }
+  for (i = num_gates - 2; i >= 1; i--)
+  {
+    w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_);
+    w_p[i] = p_to_n_sz_ratio * w_n[i];
+  }
+  return num_gates;
+}
+
diff --git a/ext/mcpat/cacti/component.h b/ext/mcpat/cacti/component.h
new file mode 100644 (file)
index 0000000..75e2cb0
--- /dev/null
@@ -0,0 +1,84 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __COMPONENT_H__
+#define __COMPONENT_H__
+
+#include "area.h"
+#include "parameter.h"
+
+using namespace std;
+
+class Crossbar;
+class Bank;
+
+class Component  // NOTE(review): logical_effort() is protected, so this is presumably used as a base class - confirm
+{
+  public:
+    Component();
+    ~Component();  // NOTE(review): non-virtual; confirm derived objects are never deleted through a Component*
+
+    Area area;
+    powerDef power,rt_power;
+    double delay;
+    double cycle_time;
+
+    double compute_gate_area(  // area of an INV/NOR/NAND gate; 0 for non-positive widths; exits on an unknown gate_type
+        int gate_type,
+        int num_inputs,
+        double w_pmos,
+        double w_nmos,
+        double h_gate);
+
+    double compute_tr_width_after_folding(double input_width, double threshold_folding_width);  // cell width after folding; 0 if input_width <= 0
+    double height_sense_amplifier(double pitch_sense_amp);  // sums folded PMOS and NMOS device stacks plus diffusion gaps
+
+  protected:
+    int logical_effort(  // returns the number of gates chosen and fills w_n/w_p; reads w_n[0]/w_p[0] when the last stage is clamped
+        int    num_gates_min,
+        double g,
+        double F,
+        double * w_n,
+        double * w_p,
+        double C_load,
+        double p_to_n_sz_ratio,
+        bool   is_dram_,
+        bool   is_wl_tr_,
+        double max_w_nmos);
+
+  private:
+    double compute_diffusion_width(int num_stacked_in, int num_folded_tr);  // helper called by compute_gate_area
+};
+
+#endif
+
diff --git a/ext/mcpat/cacti/const.h b/ext/mcpat/cacti/const.h
new file mode 100644 (file)
index 0000000..aef7d01
--- /dev/null
@@ -0,0 +1,270 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef __CONST_H__
+#define __CONST_H__
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*  The following are things you might want to change
+ *  when compiling
+ */
+
+/*
+ * Address bits in a word, and number of output bits from the cache
+ */
+
+/*
+was: #define ADDRESS_BITS 32
+now: I'm using 42 bits as in the Power4,
+since that's bigger than the 36 bits on the Pentium 4
+and 40 bits on the Opteron
+*/
+const int ADDRESS_BITS = 42;
+
+/*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state
+  cache coherency protocol (MESI), 1 bit for MRU (change this to log(ways) for full LRU).
+  So in total we have 1 + 1 + 2 + 1 = 5 */
+const int EXTRA_TAG_BITS = 5;
+
+/* limits on the various N parameters */
+
+const unsigned int MAXDATAN     = 512;      // maximum for Ndwl and Ndbl
+const unsigned int MAXSUBARRAYS = 1048576;  // maximum subarrays for data and tag arrays
+const unsigned int MAXDATASPD   = 256;      // maximum for Nspd
+const unsigned int MAX_COL_MUX  = 256;
+
+
+
+#define ROUTER_TYPES 3
+#define WIRE_TYPES 6
+
+const double Cpolywire = 0;
+
+
+/* Threshold voltages (as a proportion of Vdd)
+   If you don't know them, set all values to 0.5 */
+#define VTHFA1         0.452
+#define VTHFA2         0.304
+#define VTHFA3         0.420
+#define VTHFA4         0.413
+#define VTHFA5         0.405
+#define VTHFA6         0.452
+#define VSINV          0.452
+#define VTHCOMPINV     0.437
+#define VTHMUXNAND     0.548  // TODO : this constant must be revisited
+#define VTHEVALINV     0.452
+#define VTHSENSEEXTDRV 0.438
+
+
+//WmuxdrvNANDn and WmuxdrvNANDp are no longer being used but it's part of the old
+//delay_comparator function which we are using exactly as it used to be, so just setting these to 0
+const double WmuxdrvNANDn = 0;
+const double WmuxdrvNANDp = 0;
+
+
+/*===================================================================*/
+/*
+ * The following are things you probably wouldn't want to change.
+ */
+
+#define BIGNUM 1e30
+#define INF 9999999
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+/* Used to communicate with the horowitz model */
+#define RISE 1
+#define FALL 0
+#define NCH  1
+#define PCH  0
+
+
+#define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer
+//conversion problems that were occurring within CACTI. The typical problem was
+//that with different compilers a floating point number like 3.0 would get represented as either
+//2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would
+//be computed differently depending on the compiler. What we are doing now is to replace
+//int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that
+//this works only when x is an integer >= 0.
+/*
+ * Sheng thinks this is more a solution to solve the simple truncate problem
+ * (http://www.cs.tut.fi/~jkorpela/round.html) rather than the problem mentioned above.
+ * Unfortunately, this solution causes nasty bugs (different results when using O0 and O3).
+ * Moreover, rounding is not correct in CACTI: when an extra fraction of a bit/line is needed,
+ * we need to provide a complete bit/line even if the fraction is just 0.01.
+ * So, in versions later than 6.5 we use (int)ceil() for double-to-int conversion.
+ */
+
+#define EPSILON2 0.1
+#define EPSILON3 0.6
+
+
+#define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume
+//that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the
+//row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs
+#define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So
+//the maximum number of row decode outputs will be 2^9*2^9
+#define MINSUBARRAYCOLS 2
+#define MAXSUBARRAYCOLS 262144
+
+
+#define INV 0
+#define NOR 1
+#define NAND 2
+
+
+#define NUMBER_TECH_FLAVORS 4
+
+#define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative
+//0 = Aggressive projections, 1 = Conservative projections
+#define NUMBER_WIRE_TYPES 4 //local, semi-global and global
+//1 = 'Semi-global' wire type, 2 = 'Global' wire type
+
+
+const int dram_cell_tech_flavor = 3;
+
+
+#define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV.
+
+#define fopt 4.0
+
+#define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0
+#define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1
+#define NUMBER_MATS_PER_REDUNDANT_MAT 8
+
+#define NUMBER_STACKED_DIE_LAYERS 1
+
+// this variable can be set to carry out solution optimization for
+// a maximum area allocation.
+#define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5
+
+// this variable can also be employed when solution optimization
+// with maximum area allocation is carried out.
+#define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50
+
+// this variable can also be employed when solution optimization
+// with maximum area allocation is carried out.
+#define MIN_AREA_EFFICIENCY 20
+
+// this variable can be employed when solution with a desired
+// aspect ratio is required.
+#define STACKED_DIE_LAYER_ASPECT_RATIO 1
+
+// this variable can be employed when solution with a desired
+// aspect ratio is required.
+#define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101
+
+// this variable can be employed to carry out solution optimization
+// for a certain target random cycle time.
+#define TARGET_CYCLE_TIME_ns 1000000000
+
+#define NUMBER_PIPELINE_STAGES 4
+
+// this can be used to model the length of interconnect
+// between a bank and a crossbar
+#define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron
+
+#define IS_CROSSBAR 0
+#define NUMBER_INPUT_PORTS_CROSSBAR 8
+#define NUMBER_OUTPUT_PORTS_CROSSBAR 8
+#define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256
+
+
+#define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1
+#define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1
+
+#define PAGE_MODE 0
+
+#define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60
+// We are actually not using this variable in the CACTI code. We just want to acknowledge that
+// this current should be multiplied by the DDR(n) system VDD value to compute the standby power
+// consumed during precharge.
+
+
+const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125;
+const double CU_RESISTIVITY = 0.022; //ohm-micron
+const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron
+const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron
+
+const static uint32_t sram_num_cells_wl_stitching_ = 16;
+const static uint32_t dram_num_cells_wl_stitching_ = 64;
+const static uint32_t comm_dram_num_cells_wl_stitching_ = 256;
+const static double num_bits_per_ecc_b_          = 8.0;
+
+const double    bit_to_byte  = 8.0;
+
+#define MAX_NUMBER_GATES_STAGE 20
+#define MAX_NUMBER_HTREE_NODES 20
+#define NAND2_LEAK_STACK_FACTOR 0.2
+#define NAND3_LEAK_STACK_FACTOR 0.2
+#define NOR2_LEAK_STACK_FACTOR 0.2
+#define INV_LEAK_STACK_FACTOR  0.5
+#define MAX_NUMBER_ARRAY_PARTITIONS 1000000
+
+// abbreviations used in this project
+// ----------------------------------
+//
+//  num  : number
+//  rw   : read/write
+//  rd   : read
+//  wr   : write
+//  se   : single-ended
+//  sz   : size
+//  F    : feature
+//  w    : width
+//  h    : height or horizontal
+//  v    : vertical or velocity
+
+
+enum ram_cell_tech_type_num  // NOTE(review): five values while NUMBER_TECH_FLAVORS is 4 - confirm comm_dram never indexes a per-flavor table
+{
+  itrs_hp   = 0,
+  itrs_lstp = 1,
+  itrs_lop  = 2,
+  lp_dram   = 3,  // same value as the dram_cell_tech_flavor constant defined earlier in this header
+  comm_dram = 4
+};
+
+const double pppm[4]      = {1,1,1,1};  // all four entries enabled
+const double pppm_lkg[4]  = {0,1,1,0};  // entries 1 and 2: the union of pppm_Isub and pppm_Ig
+const double pppm_dyn[4]  = {1,0,0,0};  // entry 0 only
+const double pppm_Isub[4] = {0,1,0,0};  // entry 1 only
+const double pppm_Ig[4]   = {0,0,1,0};  // entry 2 only
+const double pppm_sc[4]   = {0,0,0,1};  // entry 3 only; NOTE(review): element order looks like {dyn, Isub, Ig, sc} - confirm at the use sites
+
+
+
+#endif
diff --git a/ext/mcpat/cacti/contention.dat b/ext/mcpat/cacti/contention.dat
new file mode 100755 (executable)
index 0000000..826553e
--- /dev/null
@@ -0,0 +1,126 @@
+l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c64l2b: 9 11 19 29 43 62 81 102
+l34c64l4b: 6 8 12 17 24 29 39 47
+l34c64l8b: 7 8 10 14 18 22 25 30
+l34c64l16b: 7 7 9 12 14 17 20 24
+l34c64l32b: 7 7 9 12 14 17 20 24 -r
+l34c64l64b: 7 7 9 12 14 17 20 24 -r
+l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c128l2b: 4 10 19 30 44 64 82 103
+l34c128l4b: 3 6 11 17 24 31 38 47
+l34c128l8b: 3 5 9 13 17 21 25 29
+l34c128l16b: 4 5 7 10 13 16 19 22
+l34c128l32b: 4 5 7 10 13 16 19 22 -r
+l34c128l64b: 4 5 7 10 13 16 19 22 -r
+l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c256l2b: 3 10 19 30 44 63 82 103
+l34c256l4b: 3 6 11 17 24 31 38 47
+l34c256l8b: 2 5 8 12 16 20 24 29
+l34c256l16b: 2 4 7 9 12 15 18 21
+l34c256l32b: 2 4 7 9 12 15 18 21 -r
+l34c256l64b: 2 4 7 9 12 15 18 21 -r
+l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c64l2b: 57 59 77 90 137 187 219 245
+l38c64l4b: 35 40 48 56 43 61 80 101
+l38c64l8b: 18 27 41 45 52 58 58 58  -r
+l38c64l16b: 16 17 19 35 40 49 53 53 -r
+l38c64l32b: 15 15 17 19 22 25 30 30 -r
+l38c64l64b: 15 15 17 19 22 25 30 30 -r
+l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c128l2b: 38 50 78 93 139 188 220 245
+l38c128l4b: 29 37 46 56 43 61 81 102
+l38c128l8b: 16 30 39 44 50 57 57 57 -r
+l38c128l16b: 14 16 19 33 40 47 52 52 -r
+l38c128l32b: 14 15 17 20 23 27 31 31 -r
+l38c128l64b: 14 15 17 20 23 27 31 31 -r
+l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c256l2b: 35 50 78 94 139 188 220 246 
+l38c256l4b: 28 36 45 55 55 61 81 102
+l38c256l8b: 17 30 38 43 50 57 57 57 -r
+l38c256l16b: 15 17 21 32 40 47 51 51
+l38c256l32b: 15 17 19 21 24 29 33 33
+l38c256l64b: 15 17 19 21 24 29 33 33 -r
+l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c64l4b: 34 35 78 126 178 220 252 274
+l316c64l8b: 9 11 23 43 62 87 105 130
+l316c64l16b: 7 9 13 23 33 45 56 67
+l316c64l32b: 5 6 7 10 13 19 25 30
+l316c64l64b: 4 5 6 8 10 14 18 21
+l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c128l2b: 25 131 243 1000 1000 1000 1000 1000
+l316c128l4b: 8 28 79 127 179 221 253 274
+l316c128l8b: 4 9 22 43 62 88 106 131
+l316c128l16b: 4 6 11 21 32 44 55 67
+l316c128l32b: 4 6 11 12 12 18 24 29
+l316c128l64b: 2 3 5 7 9 13 17 21
+l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c256l4b: 5 28 80 128 180 221 253 274
+l316c256l8b: 3 8 22 43 63 88 107 131
+l316c256l16b: 2 5 11 21 32 44 55 67
+l316c256l32b: 2 3 5 8 12 18 24 29
+l316c256l64b: 2 3 4 6 9 13 17 21
+l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c64l2b: 10 12 24 41 60 86 105 122
+l24c64l4b: 5 7 13 20 29 38 47 56
+l24c64l8b: 5 6 9 14 18 24 29 35
+l24c64l16b: 4 5 7 10 12 16 19 22
+l24c64l32b: 5 5 6 8 10 12 14 17
+l24c64l64b: 5 5 6 8 10 12 14 16
+l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c128l4b: 3 7 13 20 29 38 47 57
+l24c128l8b: 3 5 9 13 18 23 29 35
+l24c128l16b: 3 4 6 9 12 15 19 22
+l24c128l32b: 3 4 5 7 9 11 14 16
+l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l4b: 2 6 13 20 29 38 47 57
+l24c256l8b: 2 4 8 13 18 23 28 35
+l24c256l16b: 2 3 6 8 11 15 18 22
+l24c256l32b: 2 3 5 6 8 11 14 16
+l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c64l2b: 46 52 117 157 188 225 246 261
+l28c64l4b: 19 25 39 54 96 107 120 150
+l28c64l8b: 9 12 21 30 39 47 58 79
+l28c64l16b: 8 9 11 16 25 32 37 42
+l28c64l32b: 7 8 9 11 14 19 23 28
+l28c64l64b: 7 7 8 10 12 14 18 22 
+l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c128l4b: 12 22 39 54 98 108 130 151
+l28c128l8b: 7 12 21 30 39 48 59 80
+l28c128l16b: 6 8 11 16 24 31 37 42
+l28c128l32b: 6 7 9 11 14 19 24 28
+l28c128l64b: 6 7 9 11 14 19 24 28
+l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c256l4b: 12 22 39 54 100 108 130 152
+l28c256l8b: 7 12 21 30 39 48 59 81
+l28c256l16b: 6 8 11 16 24 31 37 42
+l28c256l32b: 6 7 9 11 14 19 24 28
+l28c256l64b: 6 7 9 11 14 19 24 28
+l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c64l4b: 34 35 78 126 178 220 252 274
+l216c64l8b: 9 11 23 43 62 87 105 130
+l216c64l16b: 7 9 13 23 33 45 56 67
+l216c64l32b: 5 6 7 10 13 19 25 30
+l216c64l64b: 4 5 6 8 10 14 18 21
+l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c128l2b: 25 131 243 1000 1000 1000 1000 1000
+l216c128l4b: 8 28 79 127 179 221 253 274
+l216c128l8b: 4 9 22 43 62 88 106 131
+l216c128l16b: 4 6 11 21 32 44 55 67
+l216c128l32b: 4 6 11 12 12 18 24 29
+l216c128l64b: 2 3 5 7 9 13 17 21
+l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c256l4b: 5 28 80 128 180 221 253 274
+l216c256l8b: 3 8 22 43 63 88 107 131
+l216c256l16b: 2 5 11 21 32 44 55 67
+l216c256l32b: 2 3 5 8 12 18 24 29
+l216c256l64b: 2 3 4 6 9 13 17 21
diff --git a/ext/mcpat/cacti/crossbar.cc b/ext/mcpat/cacti/crossbar.cc
new file mode 100644 (file)
index 0000000..a3d8532
--- /dev/null
@@ -0,0 +1,161 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "crossbar.h"
+
+#define ASPECT_THRESHOLD .8
+#define ADJ 1
+
+Crossbar::Crossbar(  // records port counts, flit size and device type, then derives min_w_pmos and Vdd
+    double n_inp_,
+    double n_out_,
+    double flit_size_,
+    TechnologyParameter::DeviceType *dt
+    ):n_inp(n_inp_), n_out(n_out_), flit_size(flit_size_), deviceType(dt)
+{
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;  // minimum NMOS width scaled by the device's n_to_p_eff_curr_drv_ratio
+  Vdd = dt->Vdd;
+  CB_ADJ = 1;  // starting value; compute_power() raises it in steps of 0.2
+}
+
+Crossbar::~Crossbar(){}  // empty: nothing is released here
+
+double Crossbar::output_buffer()
+{
+  // Sizes the tri-state stages (TriS1, TriS2), stores the tri_*_cap members and returns input + output + control capacitance.
+  //Wire winit(4, 4);
+  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
+  Wire w1(g_ip->wt, l_eff);
+  //double s1 = w1.repeater_size *l_eff*ADJ/w1.repeater_spacing;
+  double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing?  l_eff *ADJ/w1.repeater_spacing : ADJ);  // scaled down when the wire is shorter than one repeater spacing
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor
+  TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
+  TriS2 = s1; //driver transistor
+
+  if (TriS1 < 1)  // clamp the first-stage size to at least 1
+    TriS1 = 1;
+
+  double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) +
+    gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0);
+//  input_cap += drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+//    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+//    gate_C(TriS2*g_tp.min_w_nmos_, 0)+
+//    drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
+//    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+//    gate_C(TriS2*min_w_pmos, 0);
+  tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+    gate_C(TriS2*g_tp.min_w_nmos_, 0)+
+    drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +  // NOTE(review): PMOS width is passed together with NCH -- confirm this is intended
+    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(TriS2*min_w_pmos, 0);
+  double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
+  double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0);
+
+  tri_inp_cap = input_cap;
+  tri_out_cap = output_cap;
+  tri_ctr_cap = ctr_cap;
+  return input_cap + output_cap + ctr_cap;  // tri_int_cap is stored above but is not part of the returned sum
+}
+
+void Crossbar::compute_power()
+{
+  // Computes area.w/area.h, dynamic and leakage power, and delay; CB_ADJ is raised (re-entering this function) while the aspect ratio is poor.
+  Wire winit(4, 4);
+  double tri_cap = output_buffer();
+  assert(tri_cap > 0);
+  //area of a tristate logic
+  double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def);
+  g_area *= 2; // to model area of output transistors
+  g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def);
+  g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, g_tp.cell_h_def);
+  double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def);
+  // effective no. of tristate buffers that need to be laid side by side
+  int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch));
+  double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out);
+  Wire w1(g_ip->wt, wire_len);
+
+  area.w = wire_len;
+  area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ;
+  Wire w2(g_ip->wt, area.h);
+
+  double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp);
+  if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb;  // fold the ratio into (0, 1]
+
+  if (aspect_ratio_cb < ASPECT_THRESHOLD) {
+    if (n_out > 2 && n_inp > 2) {
+      CB_ADJ+=0.2;
+      if (CB_ADJ < 4) {
+        this->compute_power();
+        // The nested call has already stored area, power and delay for the final CB_ADJ.
+        // Falling through would overwrite power/delay using this frame's stale w1/w2.
+        return;
+      }
+    }
+  }
+
+  power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size;
+  power.readOp.leakage      =  n_inp * n_out * flit_size * (
+    cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
+        cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
+        cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
+    w1.power.readOp.leakage + w2.power.readOp.leakage);
+  power.readOp.gate_leakage = n_inp * n_out * flit_size * (
+          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
+          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
+          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
+          w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage);
+
+  // delay calculation
+  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
+  Wire wdriver(g_ip->wt, l_eff);
+  double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1);
+  double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + n_inp*tri_out_cap;
+  delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+
+  Wire wreset();  // NOTE(review): this declares a function (most vexing parse); no Wire object is constructed -- confirm intent
+}
+
+void Crossbar::print_crossbar()
+{  // writes the crossbar geometry, power and delay summary to cout as one chained stream expression
+  cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n"
+       << "Flit size        : " << flit_size << " bits" << endl
+       << "Width            : " << area.w << " u" << endl
+       << "Height           : " << area.h << " u" << endl
+       << "Dynamic Power    : " << power.readOp.dynamic*1e9 * MIN(n_inp, n_out) << " (nJ)" << endl
+       << "Leakage Power    : " << power.readOp.leakage*1e3 << " (mW)" << endl
+       << "Gate Leakage Power    : " << power.readOp.gate_leakage*1e3 << " (mW)" << endl
+       << "Crossbar Delay   : " << delay*1e12 << " ps\n";
+}
+
+
diff --git a/ext/mcpat/cacti/crossbar.h b/ext/mcpat/cacti/crossbar.h
new file mode 100644 (file)
index 0000000..3b92651
--- /dev/null
@@ -0,0 +1,85 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __CROSSBAR__
+#define __CROSSBAR__
+
+#include <assert.h>
+
+#include <iostream>
+
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "mat.h"
+#include "parameter.h"
+#include "wire.h"
+
+class Crossbar : public Component  // crossbar model built from tri-state cross-point buffers
+{
+  public:
+    Crossbar(
+      double in,
+      double out,
+      double flit_sz,
+      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Crossbar();
+
+    void print_crossbar();   // prints the statistics to cout
+    double output_buffer();  // sizes the tri-state buffer and fills the tri_*_cap members
+    void compute_power();    // fills area, power and delay
+
+    double n_inp, n_out;  // number of inputs / outputs (stored as double)
+    double flit_size;     // printed in bits by print_crossbar()
+    double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap;  // capacitances written by output_buffer()
+
+  private:
+          double CB_ADJ;
+          /*
+           * Adjustment factor for the height of the cross-point (tri-state buffer) cell layout in the
+           * crossbar; it is tuned so that the aspect ratio of the whole crossbar gets close to one.
+           * While the ratio is adjusted, the number of wires routed over the tri-state buffers does not
+           * change, but the effective wiring pitch does. Specifically, since CB_ADJ increases during the
+           * adjustment, the tri-state buffer becomes taller and thinner and the effective wiring pitch
+           * increases. As a result, the height of the crossbar (area.h) increases.
+           */
+
+        TechnologyParameter::DeviceType *deviceType;
+    double TriS1, TriS2;     // tri-state stage sizes set by output_buffer()
+    double min_w_pmos, Vdd;  // both set in the constructor from the device type
+
+};
+
+
+
+
+#endif
diff --git a/ext/mcpat/cacti/decoder.cc b/ext/mcpat/cacti/decoder.cc
new file mode 100644 (file)
index 0000000..0de6f61
--- /dev/null
@@ -0,0 +1,1577 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "area.h"
+#include "decoder.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+Decoder::Decoder(
+    int    _num_dec_signals,
+    bool   flag_way_select,
+    double _C_ld_dec_out,
+    double _R_wire_dec_out,
+    bool   fully_assoc_,
+    bool   is_dram_,
+    bool   is_wl_tr_,
+    const  Area & cell_)
+:exist(false),
+  C_ld_dec_out(_C_ld_dec_out),
+  R_wire_dec_out(_R_wire_dec_out),
+  num_gates(0), num_gates_min(2),
+  delay(0),
+  //power(),
+  fully_assoc(fully_assoc_), is_dram(is_dram_),
+  is_wl_tr(is_wl_tr_), cell(cell_)
+{
+  // Decides whether the decoder exists and how many inputs its first gate has, then sizes it and computes its area.
+  for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++)
+  {
+    w_dec_n[i] = 0;
+    w_dec_p[i] = 0;
+  }
+
+  /*
+   * _num_dec_signals is the number of decoded signals produced as output;
+   * num_addr_bits_dec is the number of signals to be decoded,
+   * i.e. the decoder's inputs.
+   */
+  int num_addr_bits_dec = _log2(_num_dec_signals);
+
+  if (num_addr_bits_dec < 4)  // fewer than 4 address bits: the decoder exists only when way select is requested
+  {
+    if (flag_way_select)
+    {
+      exist = true;
+      num_in_signals = 2;
+    }
+    else
+    {
+      num_in_signals = 0;  // exist stays false
+    }
+  }
+  else
+  {
+    exist = true;
+
+    if (flag_way_select)  // way select adds a third input to the first gate
+    {
+      num_in_signals = 3;
+    }
+    else
+    {
+      num_in_signals = 2;
+    }
+  }
+
+  assert(cell.h>0);
+  assert(cell.w>0);
+  // the height of a row-decoder-driver cell is g_tp.h_dec * cell.h (formerly fixed at 4 * cell.h):
+  //area.h = 4 * cell.h;
+  area.h = g_tp.h_dec * cell.h;
+
+  compute_widths();
+  compute_area();
+}
+
+
+
+void Decoder::compute_widths()  // sizes the first NAND gate and lets logical_effort() fill w_dec_n/w_dec_p and return num_gates
+{
+  double F;
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram, is_wl_tr);
+  double gnand2     = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);  // presumably the logical effort of a NAND2 -- confirm
+  double gnand3     = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);  // presumably the logical effort of a NAND3 -- confirm
+
+  if (exist)  // nothing is sized when the decoder does not exist
+  {
+    if (num_in_signals == 2 || fully_assoc)
+    {
+      w_dec_n[0] = 2 * g_tp.min_w_nmos_;
+      w_dec_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand2;
+    }
+    else
+    {
+      w_dec_n[0] = 3 * g_tp.min_w_nmos_;
+      w_dec_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand3;
+    }
+
+    F *= C_ld_dec_out / (gate_C(w_dec_n[0], 0, is_dram, false, is_wl_tr) +  // F = first-gate factor * (output load / first-gate input capacitance)
+                         gate_C(w_dec_p[0], 0, is_dram, false, is_wl_tr));
+    num_gates = logical_effort(
+        num_gates_min,
+        num_in_signals == 2 ? gnand2 : gnand3,  // NOTE(review): ignores fully_assoc, unlike the branch that chose F above -- confirm
+        F,
+        w_dec_n,
+        w_dec_p,
+        C_ld_dec_out,
+        p_to_n_sz_ratio,
+        is_dram,
+        is_wl_tr,
+        g_tp.max_w_nmos_dec);
+  }
+}
+
+
+
+void Decoder::compute_area()  // sums gate area and leakage currents over the first NAND gate and the inverter chain
+{
+  double cumulative_area = 0;
+  double cumulative_curr = 0;  // cumulative Isub leakage current
+  double cumulative_curr_Ig = 0;  // cumulative gate (Ig) leakage current
+
+  if (exist)
+  { // First check if this decoder exists
+    if (num_in_signals == 2)
+    {
+      cumulative_area = compute_gate_area(NAND, 2, w_dec_p[0], w_dec_n[0], area.h);
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+    }
+    else if (num_in_signals == 3)
+    {
+      cumulative_area = compute_gate_area(NAND, 3, w_dec_p[0], w_dec_n[0], area.h);
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+    }
+
+    for (int i = 1; i < num_gates; i++)
+    {
+      cumulative_area += compute_gate_area(INV, 1, w_dec_p[i], w_dec_n[i], area.h);
+      cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+      cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);  // accumulate; plain '=' kept only the last inverter
+    }
+    power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd;
+
+    area.w = (cumulative_area / area.h);  // width follows from the total gate area at the fixed cell height
+  }
+}
+
+
+
+double Decoder::compute_delays(double inrisetime)  // returns the output rise time, or 0.0 when the decoder does not exist
+{  // adds every stage delay to 'delay' and every stage's switching term to power.readOp.dynamic (neither is reset here)
+  if (exist)
+  {
+    double ret_val = 0;  // outrisetime
+    int    i;
+    double rd, tf, this_delay, c_load, c_intrinsic, Vpp;
+    double Vdd = g_tp.peri_global.Vdd;
+
+    if ((is_wl_tr) && (is_dram))  // Vpp is the voltage applied to the final output load below
+    {
+      Vpp = g_tp.vpp;
+    }
+    else if (is_wl_tr)
+    {
+      Vpp = g_tp.sram_cell.Vdd;
+    }
+    else
+    {
+      Vpp = g_tp.peri_global.Vdd;
+    }
+
+    // first stage: the num_in_signals-input gate (index 0) driving the gate at index 1
+    rd = tr_R_on(w_dec_n[0], NCH, num_in_signals, is_dram, false, is_wl_tr);
+    c_load = gate_C(w_dec_n[1] + w_dec_p[1], 0.0, is_dram, false, is_wl_tr);
+    c_intrinsic = drain_C_(w_dec_p[0], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) * num_in_signals +
+                  drain_C_(w_dec_n[0], NCH, num_in_signals, 1, area.h, is_dram, false, is_wl_tr);
+    tf = rd * (c_intrinsic + c_load);
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay += this_delay;
+    inrisetime = this_delay / (1.0 - 0.5);  // rise time handed to the next stage
+    power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+
+    for (i = 1; i < num_gates - 1; ++i)  // intermediate stages, each loaded by the gate at index i+1
+    {
+      rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr);
+      c_load = gate_C(w_dec_p[i+1] + w_dec_n[i+1], 0.0, is_dram, false, is_wl_tr);
+      c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
+                    drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+      delay += this_delay;
+      inrisetime = this_delay / (1.0 - 0.5);
+      power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+    }
+
+    // add delay of final inverter that drives the wordline
+    i = num_gates - 1;
+    c_load = C_ld_dec_out;
+    rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr);
+    c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
+                  drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
+    tf = rd * (c_intrinsic + c_load) + R_wire_dec_out * c_load / 2;  // includes the output wire resistance term
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay  += this_delay;
+    ret_val = this_delay / (1.0 - 0.5);
+    power.readOp.dynamic += c_load * Vpp * Vpp + c_intrinsic * Vdd * Vdd;  // the output load uses Vpp, the intrinsic capacitance uses Vdd
+
+    return ret_val;
+  }
+  else
+  {
+    return 0.0;
+  }
+}
+
+void Decoder::leakage_feedback(double temperature)  // recomputes power.readOp.leakage and gate_leakage; 'temperature' is not read in this body
+{
+  double cumulative_curr = 0;  // cumulative Isub leakage current
+  double cumulative_curr_Ig = 0;  // cumulative gate (Ig) leakage current
+
+  if (exist)
+  { // First check if this decoder exists
+    if (num_in_signals == 2)
+    {
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+    }
+    else if (num_in_signals == 3)
+    {
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+    }
+
+    for (int i = 1; i < num_gates; i++)
+    {
+      cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+      cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);  // accumulate; plain '=' kept only the last inverter
+    }
+
+    power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd;
+  }
+}
+
+PredecBlk::PredecBlk(  // configures one predecoder block (is_blk1 selects block 1 or block 2) whose output load is derived from decoder dec_
+    int    num_dec_signals,
+    Decoder * dec_,
+    double C_wire_predec_blk_out,
+    double R_wire_predec_blk_out_,
+    int    num_dec_per_predec,
+    bool   is_dram,
+    bool   is_blk1)
+ :dec(dec_),
+  exist(false),
+  number_input_addr_bits(0),
+  C_ld_predec_blk_out(0),
+  R_wire_predec_blk_out(0),
+  branch_effort_nand2_gate_output(1),
+  branch_effort_nand3_gate_output(1),
+  flag_two_unique_paths(false),
+  flag_L2_gate(0),
+  number_inputs_L1_gate(0),
+  number_gates_L1_nand2_path(0),
+  number_gates_L1_nand3_path(0),
+  number_gates_L2(0),
+  min_number_gates_L1(2),
+  min_number_gates_L2(2),
+  num_L1_active_nand2_path(0),
+  num_L1_active_nand3_path(0),
+  delay_nand2_path(0),
+  delay_nand3_path(0),
+  power_nand2_path(),
+  power_nand3_path(),
+  power_L2(),
+  is_dram_(is_dram)
+{
+  int    branch_effort_predec_out;  // assigned only in the branches that use it
+  double C_ld_dec_gate;             // assigned only in the branches that use it
+  int    num_addr_bits_dec = _log2(num_dec_signals);
+  int    blk1_num_input_addr_bits = (num_addr_bits_dec + 1) / 2;  // block 1 takes the larger half when the bit count is odd
+  int    blk2_num_input_addr_bits = num_addr_bits_dec - blk1_num_input_addr_bits;
+
+  w_L1_nand2_n[0] = 0;
+  w_L1_nand2_p[0] = 0;
+  w_L1_nand3_n[0] = 0;
+  w_L1_nand3_p[0] = 0;
+
+  if (is_blk1 == true)
+  {
+    if (num_addr_bits_dec <= 0)
+    {
+      return;  // nothing to predecode: exist stays false and no sizing is done
+    }
+    else if (num_addr_bits_dec < 4)
+    {
+      // Just one predecoder block is required with NAND2 gates. No decoder required.
+      // The first level of predecoding directly drives the decoder output load
+      exist = true;
+      number_input_addr_bits = num_addr_bits_dec;
+      R_wire_predec_blk_out = dec->R_wire_dec_out;
+      C_ld_predec_blk_out = dec->C_ld_dec_out;
+    }
+    else
+    {
+      exist = true;
+      number_input_addr_bits   = blk1_num_input_addr_bits;
+      branch_effort_predec_out = (1 << blk2_num_input_addr_bits);  // 2^(address bits handled by the other block)
+      C_ld_dec_gate = num_dec_per_predec * gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_, false, false);
+      R_wire_predec_blk_out = R_wire_predec_blk_out_;
+      C_ld_predec_blk_out = branch_effort_predec_out * C_ld_dec_gate + C_wire_predec_blk_out;
+    }
+  }
+  else
+  {
+    if (num_addr_bits_dec >= 4)  // block 2 exists only when there are at least 4 address bits
+    {
+      exist = true;
+      number_input_addr_bits   = blk2_num_input_addr_bits;
+      branch_effort_predec_out = (1 << blk1_num_input_addr_bits);  // 2^(address bits handled by the other block)
+      C_ld_dec_gate = num_dec_per_predec * gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_, false, false);
+      R_wire_predec_blk_out = R_wire_predec_blk_out_;
+      C_ld_predec_blk_out = branch_effort_predec_out * C_ld_dec_gate + C_wire_predec_blk_out;
+    }
+  }
+
+  compute_widths();  // returns immediately when exist is false
+  compute_area();
+}
+
+
+
+void PredecBlk::compute_widths()
+{
+  double F, c_load_nand3_path, c_load_nand2_path;
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+  double gnand2 = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+  double gnand3 = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+
+  if (exist == false) return;
+
+
+  switch (number_input_addr_bits)
+  {
+    case 1:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 0;
+      break;
+    case 2:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 0;
+      break;
+    case 3:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 0;
+      break;
+    case 4:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 2;
+      branch_effort_nand2_gate_output = 4;
+      break;
+    case 5:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 2;
+      branch_effort_nand2_gate_output = 8;
+      branch_effort_nand3_gate_output = 4;
+      break;
+    case 6:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 2;
+      branch_effort_nand3_gate_output = 8;
+      break;
+    case 7:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 3;
+      branch_effort_nand2_gate_output = 32;
+      branch_effort_nand3_gate_output = 16;
+      break;
+    case 8:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 3;
+      branch_effort_nand2_gate_output = 64;
+      branch_effort_nand3_gate_output = 32;
+      break;
+    case 9:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 3;
+      branch_effort_nand3_gate_output = 64;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // find the number of gates and sizing in second level of predecoder (if there is a second level)
+  if (flag_L2_gate)
+  {
+    if (flag_L2_gate == 2)
+    { // 2nd level is a NAND2 gate
+      w_L2_n[0] = 2 * g_tp.min_w_nmos_;
+      F = gnand2;
+    }
+    else
+    { // 2nd level is a NAND3 gate
+      w_L2_n[0] = 3 * g_tp.min_w_nmos_;
+      F = gnand3;
+    }
+    w_L2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+    F *= C_ld_predec_blk_out / (gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+    number_gates_L2 = logical_effort(
+        min_number_gates_L2,
+        flag_L2_gate == 2 ? gnand2 : gnand3,
+        F,
+        w_L2_n,
+        w_L2_p,
+        C_ld_predec_blk_out,
+        p_to_n_sz_ratio,
+        is_dram_, false,
+        g_tp.max_w_nmos_);
+
+    // Now find the number of gates and widths in first level of predecoder
+    if ((flag_two_unique_paths)||(number_inputs_L1_gate == 2))
+    { // Whenever flag_two_unique_paths is true, it means first level of decoder employs
+      // both NAND2 and NAND3 gates. Or when number_inputs_L1_gate is 2, it means
+      // a NAND2 gate is used in the first level of the predecoder
+      c_load_nand2_path = branch_effort_nand2_gate_output *
+        (gate_C(w_L2_n[0], 0, is_dram_) +
+         gate_C(w_L2_p[0], 0, is_dram_));
+      w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_;
+      w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand2 * c_load_nand2_path /
+        (gate_C(w_L1_nand2_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand2_p[0], 0, is_dram_));
+      number_gates_L1_nand2_path = logical_effort(
+          min_number_gates_L1,
+          gnand2,
+          F,
+          w_L1_nand2_n,
+          w_L1_nand2_p,
+          c_load_nand2_path,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+
+    //Now find widths of gates along path in which first gate is a NAND3
+    if ((flag_two_unique_paths)||(number_inputs_L1_gate == 3))
+    { // Whenever flag_two_unique_paths is TRUE, it means first level of decoder employs
+      // both NAND2 and NAND3 gates. Or when number_inputs_L1_gate is 3, it means
+      // a NAND3 gate is used in the first level of the predecoder
+      c_load_nand3_path = branch_effort_nand3_gate_output *
+        (gate_C(w_L2_n[0], 0, is_dram_) +
+         gate_C(w_L2_p[0], 0, is_dram_));
+      w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_;
+      w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand3 * c_load_nand3_path /
+        (gate_C(w_L1_nand3_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand3_p[0], 0, is_dram_));
+      number_gates_L1_nand3_path = logical_effort(
+          min_number_gates_L1,
+          gnand3,
+          F,
+          w_L1_nand3_n,
+          w_L1_nand3_p,
+          c_load_nand3_path,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+  }
+  else
+  { // find number of gates and widths in first level of predecoder block when there is no second level
+    if (number_inputs_L1_gate == 2)
+    {
+      w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_;
+      w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand2*C_ld_predec_blk_out /
+        (gate_C(w_L1_nand2_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand2_p[0], 0, is_dram_));
+      number_gates_L1_nand2_path = logical_effort(
+          min_number_gates_L1,
+          gnand2,
+          F,
+          w_L1_nand2_n,
+          w_L1_nand2_p,
+          C_ld_predec_blk_out,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+    else if (number_inputs_L1_gate == 3)
+    {
+      w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_;
+      w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand3*C_ld_predec_blk_out /
+        (gate_C(w_L1_nand3_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand3_p[0], 0, is_dram_));
+      number_gates_L1_nand3_path = logical_effort(
+          min_number_gates_L1,
+          gnand3,
+          F,
+          w_L1_nand3_n,
+          w_L1_nand3_p,
+          C_ld_predec_blk_out,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+  }
+}
+
+
+
+void PredecBlk::compute_area()
+{
+  if (exist)
+  { // First check whether a predecoder block is needed
+    int num_L1_nand2 = 0;
+    int num_L1_nand3 = 0;
+    int num_L2 = 0;
+    double tot_area_L1_nand3  =0;
+    double leak_L1_nand3      =0;
+    double gate_leak_L1_nand3 =0;
+
+    double tot_area_L1_nand2  = compute_gate_area(NAND, 2, w_L1_nand2_p[0], w_L1_nand2_n[0], g_tp.cell_h_def);
+    double leak_L1_nand2      = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    if (number_inputs_L1_gate != 3) {
+      tot_area_L1_nand3 = 0;
+      leak_L1_nand3 = 0;
+      gate_leak_L1_nand3 =0;
+    }
+    else {
+      tot_area_L1_nand3  = compute_gate_area(NAND, 3, w_L1_nand3_p[0], w_L1_nand3_n[0], g_tp.cell_h_def);
+      leak_L1_nand3      = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand);
+      gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand);
+    }
+
+    switch (number_input_addr_bits)
+    {
+      case 1: //2 NAND2 gates
+        num_L1_nand2 = 2;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 2: //4 NAND2 gates
+        num_L1_nand2 = 4;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 3: //8 NAND3 gates
+        num_L1_nand3 = 8;
+        num_L2       = 0;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =1;
+        break;
+      case 4: //4 + 4 NAND2 gates
+        num_L1_nand2 = 8;
+        num_L2       = 16;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =0;
+        break;
+      case 5: //4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 8;
+        num_L2       = 32;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =1;
+        break;
+      case 6: //8 + 8 NAND3 gates
+        num_L1_nand3 = 16;
+        num_L2       = 64;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =2;
+        break;
+      case 7: //4 + 4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 8;
+        num_L1_nand3 = 8;
+        num_L2       = 128;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =1;
+        break;
+      case 8: //4 NAND2 gates, 8 + 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 16;
+        num_L2       = 256;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =2;
+        break;
+      case 9: //8 + 8 + 8 NAND3 gates
+        num_L1_nand3 = 24;
+        num_L2       = 512;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =3;
+        break;
+      default:
+        break;
+    }
+
+    for (int i = 1; i < number_gates_L1_nand2_path; ++i)
+    {
+      tot_area_L1_nand2  += compute_gate_area(INV, 1, w_L1_nand2_p[i], w_L1_nand2_n[i], g_tp.cell_h_def);
+      leak_L1_nand2      += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+      gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+    }
+    tot_area_L1_nand2  *= num_L1_nand2;
+    leak_L1_nand2      *= num_L1_nand2;
+    gate_leak_L1_nand2 *= num_L1_nand2;
+
+    for (int i = 1; i < number_gates_L1_nand3_path; ++i)
+    {
+      tot_area_L1_nand3  += compute_gate_area(INV, 1, w_L1_nand3_p[i], w_L1_nand3_n[i], g_tp.cell_h_def);
+      leak_L1_nand3      += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+      gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+    }
+    tot_area_L1_nand3  *= num_L1_nand3;
+    leak_L1_nand3      *= num_L1_nand3;
+    gate_leak_L1_nand3 *= num_L1_nand3;
+
+    double cumulative_area_L1 = tot_area_L1_nand2 + tot_area_L1_nand3;
+    double cumulative_area_L2 = 0.0;
+    double leakage_L2         = 0.0;
+    double gate_leakage_L2    = 0.0;
+
+    if (flag_L2_gate == 2)
+    {
+      cumulative_area_L2 = compute_gate_area(NAND, 2, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def);
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+    }
+    else if (flag_L2_gate == 3)
+    {
+      cumulative_area_L2 = compute_gate_area(NAND, 3, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def);
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+    }
+
+    for (int i = 1; i < number_gates_L2; ++i)
+    {
+      cumulative_area_L2 += compute_gate_area(INV, 1, w_L2_p[i], w_L2_n[i], g_tp.cell_h_def);
+      leakage_L2         += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+      gate_leakage_L2    += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+    }
+    cumulative_area_L2 *= num_L2;
+    leakage_L2         *= num_L2;
+    gate_leakage_L2    *= num_L2;
+
+    power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.leakage         = leakage_L2    * g_tp.peri_global.Vdd;
+    area.set_area(cumulative_area_L1 + cumulative_area_L2);
+    power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.gate_leakage         = gate_leakage_L2    * g_tp.peri_global.Vdd;
+  }
+}
+
+
+
+/**
+ * Walks the predecode block stage by stage and accumulates delay and dynamic
+ * read energy for the NAND2 and NAND3 paths.
+ *
+ * Results are added (+=) to delay_nand2_path, delay_nand3_path and to the
+ * readOp.dynamic fields of power_nand2_path, power_nand3_path and power_L2,
+ * so a second call keeps accumulating on top of the first.
+ *
+ * @param inrisetime  input rise times, first = NAND2 path, second = NAND3 path
+ * @return            output rise times in the same <nand2, nand3> order; both
+ *                    are 0 when the block does not exist
+ *
+ * The member delay is set to the larger of the two returned values.
+ */
+pair<double, double> PredecBlk::compute_delays(
+    pair<double, double> inrisetime)  // <nand2, nand3>
+{
+  pair<double, double> ret_val;
+  ret_val.first  = 0;  // outrisetime_nand2_path
+  ret_val.second = 0;  // outrisetime_nand3_path
+
+  double inrisetime_nand2_path = inrisetime.first;
+  double inrisetime_nand3_path = inrisetime.second;
+  int    i;
+  double rd, c_load, c_intrinsic, tf, this_delay;
+  double Vdd = g_tp.peri_global.Vdd;
+
+  // TODO: following delay calculation part can be greatly simplified.
+  // first check whether a predecoder block is required
+  if (exist)
+  {
+    //Find delay in first level of predecoder block
+    //First find delay in path
+    if ((flag_two_unique_paths) || (number_inputs_L1_gate == 2))
+    {
+      //First gate is a NAND2 gate
+      // It is loaded by the gate capacitance of stage 1, so this code reads
+      // index 1 of the width arrays unconditionally.
+      rd = tr_R_on(w_L1_nand2_n[0], NCH, 2, is_dram_);
+      c_load = gate_C(w_L1_nand2_n[1] + w_L1_nand2_p[1], 0.0, is_dram_);
+      c_intrinsic = 2 * drain_C_(w_L1_nand2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                        drain_C_(w_L1_nand2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      // The rise time handed to the next stage is derived from this stage's delay.
+      inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+      power_nand2_path.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+
+      //Add delays of all but the last inverter in the chain
+      for (i = 1; i < number_gates_L1_nand2_path - 1; ++i)
+      {
+        rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L1_nand2_n[i+1] + w_L1_nand2_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      //Add delay of the last inverter
+      i = number_gates_L1_nand2_path - 1;
+      rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_);
+      if (flag_L2_gate)
+      {
+        // Last first-level stage drives branch_effort_nand2_gate_output
+        // copies of the first second-level gate.
+        c_load = branch_effort_nand2_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { //First level directly drives decoder output load
+        // The output wire resistance contributes only in this final-stage case.
+        c_load = C_ld_predec_blk_out;
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2;
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        ret_val.first = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+    }
+
+    if ((flag_two_unique_paths) || (number_inputs_L1_gate == 3))
+    { //Same sequence as above, for the path whose first gate is a NAND3.
+      //First gate is a NAND3 gate
+      rd = tr_R_on(w_L1_nand3_n[0], NCH, 3, is_dram_);
+      c_load = gate_C(w_L1_nand3_n[1] + w_L1_nand3_p[1], 0.0, is_dram_);
+      c_intrinsic = 3 * drain_C_(w_L1_nand3_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                        drain_C_(w_L1_nand3_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+      power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+
+      //Add delays of all but the last inverter in the chain
+      for (i = 1; i < number_gates_L1_nand3_path - 1; ++i)
+      {
+        rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L1_nand3_n[i+1] + w_L1_nand3_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      //Add delay of the last inverter
+      i = number_gates_L1_nand3_path - 1;
+      rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_);
+      if (flag_L2_gate)
+      {
+        c_load = branch_effort_nand3_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { //First level directly drives decoder output load
+        c_load = C_ld_predec_blk_out;
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2;
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        ret_val.second = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+    }
+
+    // Find delay through second level
+    if (flag_L2_gate)
+    {
+      // The first L2 gate advances only one path: the NAND2 path when the L2
+      // gate is a NAND2, otherwise the NAND3 path.
+      // NOTE(review): the other path skips this gate's delay - confirm that
+      // this is intended for configurations that use both paths.
+      if (flag_L2_gate == 2)
+      {
+        rd = tr_R_on(w_L2_n[0], NCH, 2, is_dram_);
+        c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_);
+        c_intrinsic = 2 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                          drain_C_(w_L2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { // flag_L2_gate = 3
+        rd = tr_R_on(w_L2_n[0], NCH, 3, is_dram_);
+        c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_);
+        c_intrinsic = 3 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                          drain_C_(w_L2_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      // Intermediate L2 stages: one time constant per stage, applied to both
+      // paths with their own rise times; the energy is added once per stage.
+      for (i = 1; i < number_gates_L2 - 1; ++i)
+      {
+        rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L2_n[i+1] + w_L2_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      //Add delay of final inverter that drives the wordline decoders
+      // Both returned rise times are overwritten here when a second level exists.
+      i = number_gates_L2 - 1;
+      c_load = C_ld_predec_blk_out;
+      rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_);
+      c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2;
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      ret_val.first = this_delay / (1.0 - 0.5);
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      ret_val.second = this_delay / (1.0 - 0.5);
+      power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+    }
+  }
+
+  // NOTE(review): delay is taken from the returned output rise times, not
+  // from delay_nand2_path / delay_nand3_path - confirm this is intended.
+  delay = (ret_val.first > ret_val.second) ? ret_val.first : ret_val.second;
+  return ret_val;
+}
+
+void PredecBlk::leakage_feedback(double temperature)
+{
+  if (exist)
+  { // First check whether a predecoder block is needed
+    int num_L1_nand2 = 0;
+    int num_L1_nand3 = 0;
+    int num_L2 = 0;
+    double leak_L1_nand3      =0;
+    double gate_leak_L1_nand3 =0;
+
+    double leak_L1_nand2      = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    if (number_inputs_L1_gate != 3) {
+      leak_L1_nand3 = 0;
+      gate_leak_L1_nand3 =0;
+    }
+    else {
+      leak_L1_nand3      = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand);
+      gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand);
+    }
+
+    switch (number_input_addr_bits)
+    {
+      case 1: //2 NAND2 gates
+        num_L1_nand2 = 2;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 2: //4 NAND2 gates
+        num_L1_nand2 = 4;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 3: //8 NAND3 gates
+        num_L1_nand3 = 8;
+        num_L2       = 0;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =1;
+        break;
+      case 4: //4 + 4 NAND2 gates
+        num_L1_nand2 = 8;
+        num_L2       = 16;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =0;
+        break;
+      case 5: //4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 8;
+        num_L2       = 32;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =1;
+        break;
+      case 6: //8 + 8 NAND3 gates
+        num_L1_nand3 = 16;
+        num_L2       = 64;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =2;
+        break;
+      case 7: //4 + 4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 8;
+        num_L1_nand3 = 8;
+        num_L2       = 128;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =1;
+        break;
+      case 8: //4 NAND2 gates, 8 + 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 16;
+        num_L2       = 256;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =2;
+        break;
+      case 9: //8 + 8 + 8 NAND3 gates
+        num_L1_nand3 = 24;
+        num_L2       = 512;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =3;
+        break;
+      default:
+        break;
+    }
+
+    for (int i = 1; i < number_gates_L1_nand2_path; ++i)
+    {
+      leak_L1_nand2      += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+      gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+    }
+    leak_L1_nand2      *= num_L1_nand2;
+    gate_leak_L1_nand2 *= num_L1_nand2;
+
+    for (int i = 1; i < number_gates_L1_nand3_path; ++i)
+    {
+      leak_L1_nand3      += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+      gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+    }
+    leak_L1_nand3      *= num_L1_nand3;
+    gate_leak_L1_nand3 *= num_L1_nand3;
+
+    double leakage_L2         = 0.0;
+    double gate_leakage_L2    = 0.0;
+
+    if (flag_L2_gate == 2)
+    {
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+    }
+    else if (flag_L2_gate == 3)
+    {
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+    }
+
+    for (int i = 1; i < number_gates_L2; ++i)
+    {
+      leakage_L2         += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+      gate_leakage_L2    += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+    }
+    leakage_L2         *= num_L2;
+    gate_leakage_L2    *= num_L2;
+
+    power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.leakage         = leakage_L2    * g_tp.peri_global.Vdd;
+
+    power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.gate_leakage         = gate_leakage_L2    * g_tp.peri_global.Vdd;
+  }
+}
+
+PredecBlkDrv::PredecBlkDrv(
+    int    way_select_,
+    PredecBlk * blk_,
+    bool   is_dram)
+ :flag_driver_exists(0),
+  number_gates_nand2_path(0),
+  number_gates_nand3_path(0),
+  min_number_gates(2),
+  num_buffers_driving_1_nand2_load(0),
+  num_buffers_driving_2_nand2_load(0),
+  num_buffers_driving_4_nand2_load(0),
+  num_buffers_driving_2_nand3_load(0),
+  num_buffers_driving_8_nand3_load(0),
+  num_buffers_nand3_path(0),
+  c_load_nand2_path_out(0),
+  c_load_nand3_path_out(0),
+  r_load_nand2_path_out(0),
+  r_load_nand3_path_out(0),
+  delay_nand2_path(0),
+  delay_nand3_path(0),
+  power_nand2_path(),
+  power_nand3_path(),
+  blk(blk_), dec(blk->dec),
+  is_dram_(is_dram),
+  way_select(way_select_)
+{
+  for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++)
+  {
+    width_nand2_path_n[i] = 0;
+    width_nand2_path_p[i] = 0;
+    width_nand3_path_n[i] = 0;
+    width_nand3_path_p[i] = 0;
+  }
+
+  number_input_addr_bits = blk->number_input_addr_bits;
+
+  if (way_select > 1)
+  {
+    flag_driver_exists     = 1;
+    number_input_addr_bits = way_select;
+    if (dec->num_in_signals == 2)
+    {
+      c_load_nand2_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_);
+      num_buffers_driving_2_nand2_load = number_input_addr_bits;
+    }
+    else if (dec->num_in_signals == 3)
+    {
+      c_load_nand3_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_);
+      num_buffers_driving_2_nand3_load = number_input_addr_bits;
+    }
+  }
+  else if (way_select == 0)
+  {
+    if (blk->exist)
+    {
+      flag_driver_exists = 1;
+    }
+  }
+
+  compute_widths();
+  compute_area();
+}
+
+
+
+void PredecBlkDrv::compute_widths()
+{
+  // The predecode block driver accepts as input the address bits from the h-tree network. For
+  // each addr bit it then generates addr and addrbar as outputs. For now ignore the effect of
+  // inversion to generate addrbar and simply treat addrbar as addr.
+
+  double F;
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+
+  if (flag_driver_exists)
+  {
+    double C_nand2_gate_blk = gate_C(blk->w_L1_nand2_n[0] + blk->w_L1_nand2_p[0], 0, is_dram_);
+    double C_nand3_gate_blk = gate_C(blk->w_L1_nand3_n[0] + blk->w_L1_nand3_p[0], 0, is_dram_);
+
+    if (way_select == 0)
+    {
+      if (blk->number_input_addr_bits == 1)
+      { //2 NAND2 gates
+        num_buffers_driving_2_nand2_load = 1;
+        c_load_nand2_path_out            = 2 * C_nand2_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 2)
+      { //4 NAND2 gates  one 2-4 decoder
+        num_buffers_driving_4_nand2_load = 2;
+        c_load_nand2_path_out            = 4 * C_nand2_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 3)
+      { //8 NAND3 gates  one 3-8 decoder
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 4)
+      { //4 + 4 NAND2 gates two 2-4 decoder
+        num_buffers_driving_4_nand2_load = 4;
+        c_load_nand2_path_out            = 4 * C_nand2_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 5)
+      { //4 NAND2 gates, 8 NAND3 gates one 2-4 decoder and one 3-8 decoder
+        num_buffers_driving_4_nand2_load = 2;
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand2_path_out            = 4 * C_nand2_gate_blk;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 6)
+      { //8 + 8 NAND3 gates two 3-8 decoder
+        num_buffers_driving_8_nand3_load = 6;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 7)
+      { //4 + 4 NAND2 gates, 8 NAND3 gates two 2-4 decoder and one 3-8 decoder
+        num_buffers_driving_4_nand2_load = 4;
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand2_path_out            = 4 * C_nand2_gate_blk;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 8)
+      { //4 NAND2 gates, 8 + 8 NAND3 gates one 2-4 decoder and two 3-8 decoder
+        num_buffers_driving_4_nand2_load = 2;
+        num_buffers_driving_8_nand3_load = 6;
+        c_load_nand2_path_out            = 4 * C_nand2_gate_blk;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+      else if (blk->number_input_addr_bits == 9)
+      { //8 + 8 + 8 NAND3 gates three 3-8 decoder
+        num_buffers_driving_8_nand3_load = 9;
+        c_load_nand3_path_out            = 8 * C_nand3_gate_blk;
+      }
+    }
+
+    if ((blk->flag_two_unique_paths) ||
+        (blk->number_inputs_L1_gate == 2) ||
+        (number_input_addr_bits == 0) ||
+        ((way_select)&&(dec->num_in_signals == 2)))
+    { //this means that way_select is driving NAND2 in decoder.
+      width_nand2_path_n[0] = g_tp.min_w_nmos_;
+      width_nand2_path_p[0] = p_to_n_sz_ratio * width_nand2_path_n[0];
+      F = c_load_nand2_path_out / gate_C(width_nand2_path_n[0] + width_nand2_path_p[0], 0, is_dram_);
+      number_gates_nand2_path = logical_effort(
+          min_number_gates,
+          1,
+          F,
+          width_nand2_path_n,
+          width_nand2_path_p,
+          c_load_nand2_path_out,
+          p_to_n_sz_ratio,
+          is_dram_, false, g_tp.max_w_nmos_);
+    }
+
+    if ((blk->flag_two_unique_paths) ||
+        (blk->number_inputs_L1_gate == 3) ||
+        ((way_select)&&(dec->num_in_signals == 3)))
+    { //this means that way_select is driving NAND3 in decoder.
+      width_nand3_path_n[0] = g_tp.min_w_nmos_;
+      width_nand3_path_p[0] = p_to_n_sz_ratio * width_nand3_path_n[0];
+      F = c_load_nand3_path_out / gate_C(width_nand3_path_n[0] + width_nand3_path_p[0], 0, is_dram_);
+      number_gates_nand3_path = logical_effort(
+          min_number_gates,
+          1,
+          F,
+          width_nand3_path_n,
+          width_nand3_path_p,
+          c_load_nand3_path_out,
+          p_to_n_sz_ratio,
+          is_dram_, false, g_tp.max_w_nmos_);
+    }
+  }
+}
+
+
+
+void PredecBlkDrv::compute_area()
+{
+  double area_nand2_path = 0;
+  double area_nand3_path = 0;
+  double leak_nand2_path = 0;
+  double leak_nand3_path = 0;
+  double gate_leak_nand2_path = 0;
+  double gate_leak_nand3_path = 0;
+
+  if (flag_driver_exists)
+  { // first check whether a predecoder block driver is needed
+    for (int i = 0; i < number_gates_nand2_path; ++i)
+    {
+      area_nand2_path += compute_gate_area(INV, 1, width_nand2_path_p[i], width_nand2_path_n[i], g_tp.cell_h_def);
+      leak_nand2_path += cmos_Isub_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_);
+      gate_leak_nand2_path += cmos_Ig_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_);
+    }
+    area_nand2_path *= (num_buffers_driving_1_nand2_load +
+                        num_buffers_driving_2_nand2_load +
+                        num_buffers_driving_4_nand2_load);
+    leak_nand2_path *= (num_buffers_driving_1_nand2_load +
+                        num_buffers_driving_2_nand2_load +
+                        num_buffers_driving_4_nand2_load);
+    gate_leak_nand2_path *= (num_buffers_driving_1_nand2_load +
+                            num_buffers_driving_2_nand2_load +
+                            num_buffers_driving_4_nand2_load);
+
+    for (int i = 0; i < number_gates_nand3_path; ++i)
+    {
+      area_nand3_path += compute_gate_area(INV, 1, width_nand3_path_p[i], width_nand3_path_n[i], g_tp.cell_h_def);
+      leak_nand3_path += cmos_Isub_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_);
+      gate_leak_nand3_path += cmos_Ig_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_);
+    }
+    area_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load);
+    leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load);
+    gate_leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load);
+
+    power_nand2_path.readOp.leakage = leak_nand2_path * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.leakage = leak_nand3_path * g_tp.peri_global.Vdd;
+    power_nand2_path.readOp.gate_leakage = gate_leak_nand2_path * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_nand3_path * g_tp.peri_global.Vdd;
+    area.set_area(area_nand2_path + area_nand3_path);
+  }
+}
+
+
+
+pair<double, double> PredecBlkDrv::compute_delays(
+    double inrisetime_nand2_path,
+    double inrisetime_nand3_path)
+{
+  pair<double, double> ret_val;
+  ret_val.first  = 0;  // outrisetime_nand2_path
+  ret_val.second = 0;  // outrisetime_nand3_path
+  int i;
+  double rd, c_gate_load, c_load, c_intrinsic, tf, this_delay;
+  double Vdd = g_tp.peri_global.Vdd;
+
+  if (flag_driver_exists)
+  {
+    for (i = 0; i < number_gates_nand2_path - 1; ++i)
+    {
+      rd = tr_R_on(width_nand2_path_n[i], NCH, 1, is_dram_);
+      c_gate_load = gate_C(width_nand2_path_p[i+1] + width_nand2_path_n[i+1], 0.0, is_dram_);
+      c_intrinsic = drain_C_(width_nand2_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(width_nand2_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_gate_load);
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+      power_nand2_path.readOp.dynamic += (c_gate_load + c_intrinsic) * 0.5 * Vdd * Vdd;
+    }
+
+    // Final inverter drives the predecoder block or the decoder output load
+    if (number_gates_nand2_path != 0)
+    {
+      i = number_gates_nand2_path - 1;
+      rd = tr_R_on(width_nand2_path_n[i], NCH, 1, is_dram_);
+      c_intrinsic = drain_C_(width_nand2_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(width_nand2_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      c_load = c_load_nand2_path_out;
+      tf = rd * (c_intrinsic + c_load) + r_load_nand2_path_out*c_load/ 2;
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      ret_val.first = this_delay / (1.0 - 0.5);
+      power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * 0.5 * Vdd * Vdd;
+//      cout<< "c_intrinsic = " << c_intrinsic << "c_load" << c_load <<endl;
+    }
+
+    for (i = 0; i < number_gates_nand3_path - 1; ++i)
+    {
+      rd = tr_R_on(width_nand3_path_n[i], NCH, 1, is_dram_);
+      c_gate_load = gate_C(width_nand3_path_p[i+1] + width_nand3_path_n[i+1], 0.0, is_dram_);
+      c_intrinsic = drain_C_(width_nand3_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(width_nand3_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_gate_load);
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+      power_nand3_path.readOp.dynamic += (c_gate_load + c_intrinsic) * 0.5 * Vdd * Vdd;
+    }
+
+    // Final inverter drives the predecoder block or the decoder output load
+    if (number_gates_nand3_path != 0)
+    {
+      i = number_gates_nand3_path - 1;
+      rd = tr_R_on(width_nand3_path_n[i], NCH, 1, is_dram_);
+      c_intrinsic = drain_C_(width_nand3_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(width_nand3_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      c_load = c_load_nand3_path_out;
+      tf = rd*(c_intrinsic + c_load) + r_load_nand3_path_out*c_load / 2;
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      ret_val.second = this_delay / (1.0 - 0.5);
+      power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * 0.5 * Vdd * Vdd;
+    }
+  }
+  return ret_val;
+}
+
+
+// Dynamic read energy of the driver: each path's per-buffer energy is
+// weighted by that path's buffer count, and the sum is scaled by
+// num_act_mats_hor_dir.
+double PredecBlkDrv::get_rdOp_dynamic_E(int num_act_mats_hor_dir)
+{
+  const double energy_nand2 = num_addr_bits_nand2_path()*power_nand2_path.readOp.dynamic;
+  const double energy_nand3 = num_addr_bits_nand3_path()*power_nand3_path.readOp.dynamic;
+
+  return (energy_nand2 + energy_nand3) * num_act_mats_hor_dir;
+}
+
+
+
+Predec::Predec(
+    PredecBlkDrv * drv1_,
+    PredecBlkDrv * drv2_)
+:blk1(drv1_->blk), blk2(drv2_->blk), drv1(drv1_), drv2(drv2_)
+{
+  driver_power.readOp.leakage = drv1->power_nand2_path.readOp.leakage +
+                                drv1->power_nand3_path.readOp.leakage +
+                                drv2->power_nand2_path.readOp.leakage +
+                                drv2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage = blk1->power_nand2_path.readOp.leakage +
+                               blk1->power_nand3_path.readOp.leakage +
+                               blk1->power_L2.readOp.leakage +
+                               blk2->power_nand2_path.readOp.leakage +
+                               blk2->power_nand3_path.readOp.leakage +
+                               blk2->power_L2.readOp.leakage;
+  power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage;
+
+  driver_power.readOp.gate_leakage = drv1->power_nand2_path.readOp.gate_leakage +
+                                  drv1->power_nand3_path.readOp.gate_leakage +
+                                  drv2->power_nand2_path.readOp.gate_leakage +
+                                  drv2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage = blk1->power_nand2_path.readOp.gate_leakage +
+                                 blk1->power_nand3_path.readOp.gate_leakage +
+                                 blk1->power_L2.readOp.gate_leakage +
+                                 blk2->power_nand2_path.readOp.gate_leakage +
+                                 blk2->power_nand3_path.readOp.gate_leakage +
+                                 blk2->power_L2.readOp.gate_leakage;
+  power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage;
+}
+
+void PredecBlkDrv::leakage_feedback(double temperature)
+{
+  double leak_nand2_path = 0;
+  double leak_nand3_path = 0;
+  double gate_leak_nand2_path = 0;
+  double gate_leak_nand3_path = 0;
+
+  if (flag_driver_exists)
+  { // first check whether a predecoder block driver is needed
+    for (int i = 0; i < number_gates_nand2_path; ++i)
+    {
+      leak_nand2_path += cmos_Isub_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_);
+      gate_leak_nand2_path += cmos_Ig_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_);
+    }
+    leak_nand2_path *= (num_buffers_driving_1_nand2_load +
+                        num_buffers_driving_2_nand2_load +
+                        num_buffers_driving_4_nand2_load);
+    gate_leak_nand2_path *= (num_buffers_driving_1_nand2_load +
+                            num_buffers_driving_2_nand2_load +
+                            num_buffers_driving_4_nand2_load);
+
+    for (int i = 0; i < number_gates_nand3_path; ++i)
+    {
+      leak_nand3_path += cmos_Isub_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_);
+      gate_leak_nand3_path += cmos_Ig_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_);
+    }
+    leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load);
+    gate_leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load);
+
+    power_nand2_path.readOp.leakage = leak_nand2_path * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.leakage = leak_nand3_path * g_tp.peri_global.Vdd;
+    power_nand2_path.readOp.gate_leakage = gate_leak_nand2_path * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_nand3_path * g_tp.peri_global.Vdd;
+  }
+}
+
+double Predec::compute_delays(double inrisetime)
+{
+  // TODO: Jung Ho thinks that predecoder block driver locates between decoder and predecoder block.
+  pair<double, double> tmp_pair1, tmp_pair2;
+  tmp_pair1 = drv1->compute_delays(inrisetime, inrisetime);
+  tmp_pair1 = blk1->compute_delays(tmp_pair1);
+  tmp_pair2 = drv2->compute_delays(inrisetime, inrisetime);
+  tmp_pair2 = blk2->compute_delays(tmp_pair2);
+  tmp_pair1 = get_max_delay_before_decoder(tmp_pair1, tmp_pair2);
+
+  driver_power.readOp.dynamic =
+    drv1->num_addr_bits_nand2_path() * drv1->power_nand2_path.readOp.dynamic +
+    drv1->num_addr_bits_nand3_path() * drv1->power_nand3_path.readOp.dynamic +
+    drv2->num_addr_bits_nand2_path() * drv2->power_nand2_path.readOp.dynamic +
+    drv2->num_addr_bits_nand3_path() * drv2->power_nand3_path.readOp.dynamic;
+
+  block_power.readOp.dynamic =
+    blk1->power_nand2_path.readOp.dynamic*blk1->num_L1_active_nand2_path +
+    blk1->power_nand3_path.readOp.dynamic*blk1->num_L1_active_nand3_path +
+    blk1->power_L2.readOp.dynamic +
+    blk2->power_nand2_path.readOp.dynamic*blk1->num_L1_active_nand2_path  +
+    blk2->power_nand3_path.readOp.dynamic*blk1->num_L1_active_nand3_path +
+    blk2->power_L2.readOp.dynamic;
+
+  power.readOp.dynamic = driver_power.readOp.dynamic + block_power.readOp.dynamic;
+
+  delay = tmp_pair1.first;
+  return  tmp_pair1.second;
+}
+
+
+void Predec::leakage_feedback(double temperature)
+{
+  drv1->leakage_feedback(temperature);
+  drv2->leakage_feedback(temperature);
+  blk1->leakage_feedback(temperature);
+  blk2->leakage_feedback(temperature);
+
+  driver_power.readOp.leakage = drv1->power_nand2_path.readOp.leakage +
+                                drv1->power_nand3_path.readOp.leakage +
+                                drv2->power_nand2_path.readOp.leakage +
+                                drv2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage = blk1->power_nand2_path.readOp.leakage +
+                               blk1->power_nand3_path.readOp.leakage +
+                               blk1->power_L2.readOp.leakage +
+                               blk2->power_nand2_path.readOp.leakage +
+                               blk2->power_nand3_path.readOp.leakage +
+                               blk2->power_L2.readOp.leakage;
+  power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage;
+
+  driver_power.readOp.gate_leakage = drv1->power_nand2_path.readOp.gate_leakage +
+                                  drv1->power_nand3_path.readOp.gate_leakage +
+                                  drv2->power_nand2_path.readOp.gate_leakage +
+                                  drv2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage = blk1->power_nand2_path.readOp.gate_leakage +
+                                 blk1->power_nand3_path.readOp.gate_leakage +
+                                 blk1->power_L2.readOp.gate_leakage +
+                                 blk2->power_nand2_path.readOp.gate_leakage +
+                                 blk2->power_nand3_path.readOp.gate_leakage +
+                                 blk2->power_L2.readOp.gate_leakage;
+  power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage;
+}
+
+// returns <delay, risetime>
+pair<double, double> Predec::get_max_delay_before_decoder(
+    pair<double, double> input_pair1,
+    pair<double, double> input_pair2)
+{
+  pair<double, double> ret_val;
+  double delay;
+
+  delay = drv1->delay_nand2_path + blk1->delay_nand2_path;
+  ret_val.first  = delay;
+  ret_val.second = input_pair1.first;
+  delay = drv1->delay_nand3_path + blk1->delay_nand3_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first  = delay;
+    ret_val.second = input_pair1.second;
+  }
+  delay = drv2->delay_nand2_path + blk2->delay_nand2_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first  = delay;
+    ret_val.second = input_pair2.first;
+  }
+  delay = drv2->delay_nand3_path + blk2->delay_nand3_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first  = delay;
+    ret_val.second = input_pair2.second;
+  }
+
+  return ret_val;
+}
+
+
+
+// Sets up a driver for the given load and sizes it right away.
+//
+// c_gate_load_ / c_wire_load_: gate and wire capacitance being driven.
+// r_wire_load_: wire resistance seen by the last stage.
+// is_dram: forwarded to the device-model helpers.
+Driver::Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram)
+:number_gates(0),
+  min_number_gates(2),
+  c_gate_load(c_gate_load_),
+  c_wire_load(c_wire_load_),
+  r_wire_load(r_wire_load_),
+  delay(0),
+  power(),
+  is_dram_(is_dram)
+{
+  // Clear both width tables before the chain is sized.
+  for (int stage = 0; stage < MAX_NUMBER_GATES_STAGE; ++stage)
+  {
+    width_n[stage] = 0;
+    width_p[stage] = 0;
+  }
+
+  compute_widths();
+}
+
+
+void Driver::compute_widths()
+{
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+  double c_load = c_gate_load + c_wire_load;
+  width_n[0] = g_tp.min_w_nmos_;
+  width_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+
+  double F = c_load / gate_C(width_n[0] + width_p[0], 0, is_dram_);
+  number_gates = logical_effort(
+      min_number_gates,
+      1,
+      F,
+      width_n,
+      width_p,
+      c_load,
+      p_to_n_sz_ratio,
+      is_dram_, false,
+      g_tp.max_w_nmos_);
+}
+
+
+
+double Driver::compute_delay(double inrisetime)
+{
+  int    i;
+  double rd, c_load, c_intrinsic, tf;
+  double this_delay = 0;
+
+  for (i = 0; i < number_gates - 1; ++i)
+  {
+    rd = tr_R_on(width_n[i], NCH, 1, is_dram_);
+    c_load = gate_C(width_n[i+1] + width_p[i+1], 0.0, is_dram_);
+    c_intrinsic = drain_C_(width_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                  drain_C_(width_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    tf = rd * (c_intrinsic + c_load);
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay += this_delay;
+    inrisetime = this_delay / (1.0 - 0.5);
+    power.readOp.dynamic += (c_intrinsic + c_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+    power.readOp.leakage += cmos_Isub_leakage(width_n[i], width_p[i], 1, inv, is_dram_) *g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage += cmos_Ig_leakage(width_n[i], width_p[i], 1, inv, is_dram_)* g_tp.peri_global.Vdd;
+  }
+
+  i = number_gates - 1;
+  c_load = c_gate_load + c_wire_load;
+  rd = tr_R_on(width_n[i], NCH, 1, is_dram_);
+  c_intrinsic = drain_C_(width_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                drain_C_(width_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+  tf = rd * (c_intrinsic + c_load) + r_wire_load * (c_wire_load / 2 + c_gate_load);
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay += this_delay;
+  power.readOp.dynamic += (c_intrinsic + c_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power.readOp.leakage += cmos_Isub_leakage(width_n[i], width_p[i], 1, inv, is_dram_) * g_tp.peri_global.Vdd;
+  power.readOp.gate_leakage += cmos_Ig_leakage(width_n[i], width_p[i], 1, inv, is_dram_)* g_tp.peri_global.Vdd;
+
+  return this_delay / (1.0 - 0.5);
+}
+
diff --git a/ext/mcpat/cacti/decoder.h b/ext/mcpat/cacti/decoder.h
new file mode 100644 (file)
index 0000000..35631e8
--- /dev/null
@@ -0,0 +1,247 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __DECODER_H__
+#define __DECODER_H__
+
+#include <vector>
+
+#include "area.h"
+#include "component.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+class Decoder : public Component  // decoder model: sizing, area, delay and leakage entry points are declared below
+{
+  public:
+    Decoder(
+        int _num_dec_signals,
+        bool flag_way_select,
+        double _C_ld_dec_out,
+        double _R_wire_dec_out,
+        bool fully_assoc_,
+        bool is_dram_,
+        bool is_wl_tr_,
+        const Area & cell_);
+
+    bool   exist;
+    int    num_in_signals;
+    double C_ld_dec_out;
+    double R_wire_dec_out;
+    int    num_gates;
+    int    num_gates_min;
+    double w_dec_n[MAX_NUMBER_GATES_STAGE];  // per-stage widths; presumably NMOS going by the _n suffix - confirm in decoder.cc
+    double w_dec_p[MAX_NUMBER_GATES_STAGE];  // per-stage widths; presumably PMOS going by the _p suffix - confirm in decoder.cc
+    double delay;
+    //powerDef power;
+    bool   fully_assoc;
+    bool   is_dram;
+    bool   is_wl_tr;
+    const  Area & cell;  // reference member: the referenced Area must outlive this object
+
+
+    void   compute_widths();
+    void   compute_area();
+    double compute_delays(double inrisetime);  // return outrisetime
+
+    void leakage_feedback(double temperature);
+};
+
+
+
+class PredecBlk : public Component  // predecoder block; tracks separate NAND2 and NAND3 L1 paths plus an L2 stage
+{
+ public:
+  PredecBlk(
+      int num_dec_signals,
+      Decoder * dec,
+      double C_wire_predec_blk_out,
+      double R_wire_predec_blk_out,
+      int    num_dec_per_predec,
+      bool   is_dram_,
+      bool   is_blk1);
+
+  Decoder * dec;  // raw pointer; ownership is not expressed here
+  bool exist;
+  int number_input_addr_bits;
+  double C_ld_predec_blk_out;
+  double R_wire_predec_blk_out;
+  int branch_effort_nand2_gate_output;
+  int branch_effort_nand3_gate_output;
+  bool   flag_two_unique_paths;
+  int flag_L2_gate;
+  int number_inputs_L1_gate;
+  int number_gates_L1_nand2_path;
+  int number_gates_L1_nand3_path;
+  int number_gates_L2;
+  int min_number_gates_L1;
+  int min_number_gates_L2;
+  int num_L1_active_nand2_path;  // multiplies power_nand2_path.readOp.dynamic in Predec::compute_delays
+  int num_L1_active_nand3_path;  // multiplies power_nand3_path.readOp.dynamic in Predec::compute_delays
+  double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
+  double w_L2_n[MAX_NUMBER_GATES_STAGE];
+  double w_L2_p[MAX_NUMBER_GATES_STAGE];
+  double delay_nand2_path;  // added to the driver's NAND2 delay in Predec::get_max_delay_before_decoder
+  double delay_nand3_path;  // added to the driver's NAND3 delay in Predec::get_max_delay_before_decoder
+  powerDef power_nand2_path;
+  powerDef power_nand3_path;
+  powerDef power_L2;
+
+  bool is_dram_;
+
+  void compute_widths();
+  void compute_area();
+
+  void leakage_feedback(double temperature);
+
+  pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
+  // return <outrise_nand2, outrise_nand3>
+};
+
+
+class PredecBlkDrv : public Component  // inverter-chain driver in front of a PredecBlk, with separate NAND2 and NAND3 paths
+{
+ public:
+  PredecBlkDrv(
+      int   way_select,
+      PredecBlk * blk_,
+      bool  is_dram);
+
+  int flag_driver_exists;  // when 0, compute_delays() and leakage_feedback() leave delay and power untouched
+  int number_input_addr_bits;
+  int number_gates_nand2_path;  // number of inverter stages walked on the NAND2 path
+  int number_gates_nand3_path;  // number of inverter stages walked on the NAND3 path
+  int min_number_gates;
+  int num_buffers_driving_1_nand2_load;
+  int num_buffers_driving_2_nand2_load;
+  int num_buffers_driving_4_nand2_load;
+  int num_buffers_driving_2_nand3_load;
+  int num_buffers_driving_8_nand3_load;
+  int num_buffers_nand3_path;
+  double c_load_nand2_path_out;  // capacitive load of the last NAND2-path stage
+  double c_load_nand3_path_out;  // capacitive load of the last NAND3-path stage
+  double r_load_nand2_path_out;  // resistance multiplied with c_load/2 in the last NAND2-path stage's time constant
+  double r_load_nand3_path_out;  // resistance multiplied with c_load/2 in the last NAND3-path stage's time constant
+  double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];  // per-stage NMOS widths (passed with NCH)
+  double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];  // per-stage PMOS widths (passed with PCH)
+  double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];  // per-stage NMOS widths (passed with NCH)
+  double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];  // per-stage PMOS widths (passed with PCH)
+  double delay_nand2_path;  // accumulated (+=) by compute_delays()
+  double delay_nand3_path;  // accumulated (+=) by compute_delays()
+  powerDef power_nand2_path;  // readOp.dynamic accumulated by compute_delays(); leakage fields overwritten by leakage_feedback()
+  powerDef power_nand3_path;  // readOp.dynamic accumulated by compute_delays(); leakage fields overwritten by leakage_feedback()
+
+  PredecBlk * blk;  // block driven by this driver; Predec reads it to initialise blk1/blk2
+  Decoder   * dec;
+  bool  is_dram_;
+  int   way_select;
+
+  void compute_widths();
+  void compute_area();
+
+  void leakage_feedback(double temperature);
+
+
+  pair<double, double> compute_delays(
+      double inrisetime_nand2_path,
+      double inrisetime_nand3_path);  // return <outrise_nand2, outrise_nand3>
+
+  inline int num_addr_bits_nand2_path()  // sum of the three NAND2 buffer counts
+  {
+    return num_buffers_driving_1_nand2_load +
+           num_buffers_driving_2_nand2_load +
+           num_buffers_driving_4_nand2_load;
+  }
+  inline int num_addr_bits_nand3_path()  // sum of the two NAND3 buffer counts
+  {
+    return num_buffers_driving_2_nand3_load +
+           num_buffers_driving_8_nand3_load;
+  }
+  double get_rdOp_dynamic_E(int num_act_mats_hor_dir);  // (nand2 + nand3 weighted dynamic energy) * num_act_mats_hor_dir
+};
+
+
+
+class Predec : public Component  // aggregates two PredecBlkDrv objects and the PredecBlk each of them points to
+{
+  public:
+    Predec(
+        PredecBlkDrv * drv1,
+        PredecBlkDrv * drv2);
+
+    double compute_delays(double inrisetime);  // return outrisetime
+
+    void leakage_feedback(double temperature);  // refreshes drv1/drv2/blk1/blk2 leakage, then re-sums it
+    PredecBlk    * blk1;  // initialised from drv1->blk
+    PredecBlk    * blk2;  // initialised from drv2->blk
+    PredecBlkDrv * drv1;
+    PredecBlkDrv * drv2;
+
+    powerDef block_power;  // summed over blk1 and blk2 (NAND2 path, NAND3 path, L2)
+    powerDef driver_power;  // summed over drv1 and drv2 (NAND2 path, NAND3 path)
+
+  private:
+    // returns <delay, risetime>
+    pair<double, double> get_max_delay_before_decoder(
+        pair<double, double> input_pair1,
+        pair<double, double> input_pair2);
+};
+
+
+
+class Driver : public Component  // inverter chain driving a gate + wire load
+{
+ public:
+  Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);  // also calls compute_widths()
+
+  int    number_gates;  // set from logical_effort() in compute_widths()
+  int    min_number_gates;  // initialised to 2 by the constructor
+  double width_n[MAX_NUMBER_GATES_STAGE];  // per-stage NMOS widths (passed with NCH)
+  double width_p[MAX_NUMBER_GATES_STAGE];  // per-stage PMOS widths (passed with PCH)
+  double c_gate_load;
+  double c_wire_load;
+  double r_wire_load;
+  double delay;  // accumulated (+=) by compute_delay()
+  powerDef power;  // readOp dynamic/leakage/gate_leakage accumulated (+=) by compute_delay()
+  bool   is_dram_;
+
+  void   compute_widths();
+  double compute_delay(double inrisetime);  // returns the output rise time of the last stage
+};
+
+
+#endif
diff --git a/ext/mcpat/cacti/htree2.cc b/ext/mcpat/cacti/htree2.cc
new file mode 100644 (file)
index 0000000..817ea6a
--- /dev/null
@@ -0,0 +1,641 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+#include <iostream>
+
+#include "htree2.h"
+#include "wire.h"
+
+Htree2::Htree2(
+    enum Wire_type wire_model, double mat_w, double mat_h,
+    int a_bits, int d_inbits, int search_data_in, int d_outbits, int search_data_out, int bl, int wl, enum Htree_type htree_type,
+    bool uca_tree_, bool search_tree_, TechnologyParameter::DeviceType *dt)
+ :in_rise_time(0), out_rise_time(0),
+  tree_type(htree_type), mat_width(mat_w), mat_height(mat_h),
+  add_bits(a_bits), data_in_bits(d_inbits), search_data_in_bits(search_data_in),data_out_bits(d_outbits),
+  search_data_out_bits(search_data_out), ndbl(bl), ndwl(wl),
+  uca_tree(uca_tree_), search_tree(search_tree_), wt(wire_model), deviceType(dt)
+{
+  assert(ndbl >= 2 && ndwl >= 2);
+
+//  if (ndbl == 1 && ndwl == 1)
+//  {
+//    delay = 0;
+//    power.readOp.dynamic = 0;
+//    power.readOp.leakage = 0;
+//    area.w = mat_w;
+//    area.h = mat_h;
+//    return;
+//  }
+//  if (ndwl == 1) ndwl++;
+//  if (ndbl == 1) ndbl++;
+
+  max_unpipelined_link_delay = 0; //TODO
+  min_w_nmos = g_tp.min_w_nmos_;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
+
+  switch (htree_type)
+  {
+    case Add_htree:
+      wire_bw = init_wire_bw = add_bits;
+      in_htree();
+      break;
+    case Data_in_htree:
+      wire_bw = init_wire_bw = data_in_bits;
+      in_htree();
+      break;
+    case Data_out_htree:
+      wire_bw = init_wire_bw = data_out_bits;
+      out_htree();
+      break;
+    case Search_in_htree:
+      wire_bw = init_wire_bw = search_data_in_bits;//in_search_tree is broad cast, out_htree is not.
+      in_htree();
+      break;
+    case Search_out_htree:
+      wire_bw = init_wire_bw = search_data_out_bits;
+      out_htree();
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  power_bit = power;
+  power.readOp.dynamic *= init_wire_bw;
+
+  assert(power.readOp.dynamic >= 0);
+  assert(power.readOp.leakage >= 0);
+}
+
+
+
+// nand gate sizing calculation
+void Htree2::input_nand(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of a repeater  = input capacitance of nand.
+  double nsize = s1*(1 + pton_size)/(2 + pton_size);
+  nsize = (nsize < 1) ? 1 : nsize;
+
+  double tc = 2*tr_R_on(nsize*min_w_nmos, NCH, 1) *
+    (drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
+     2 * gate_C(s2*(min_w_nmos + min_w_pmos), 0));
+  delay+= horowitz (w1.out_rise_time, tc,
+      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd * wire_bw ;
+  power.readOp.leakage += (wire_bw*cmos_Isub_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
+  power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
+}
+
+
+
+// tristate buffer model consisting of not, nand, nor, and driver transistors
+void Htree2::output_buffer(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of repeater = input capacitance of nand + nor.
+  double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
+  double s_eff =  //stage eff of a repeater in a wire
+    (gate_C(s2*(min_w_nmos + min_w_pmos), 0) + w1.wire_cap(l_eff*1e-6,true))/
+    gate_C(s2*(min_w_nmos + min_w_pmos), 0);
+  double tr_size = gate_C(s1*(min_w_nmos + min_w_pmos), 0) * 1/2/(s_eff*gate_C(min_w_pmos, 0));
+  size = (size < 1) ? 1 : size;
+
+  double res_nor = 2*tr_R_on(size*min_w_pmos, PCH, 1);
+  double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);
+  double cap_nand_out = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+                        drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+                        gate_C(tr_size*min_w_pmos, 0);
+  double cap_ptrans_out = 2 *(drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+                              drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)) +
+                          gate_C(s1*(min_w_nmos + min_w_pmos), 0);
+
+  double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
+
+
+  delay += horowitz (w1.out_rise_time, tc,
+      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+
+  //nand
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+     gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+     gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //not
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //nor
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //output transistor
+  power.readOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  if(uca_tree) {
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+        power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=;
+  }
+  else {
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+        power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=deviceType->Vdd*wire_bw;
+  }
+}
+
+
+
+/* Calculates the input h-tree delay/power and sets the h-tree area.
+ * A nand gate is used at each node to
+ * limit the signal.
+ * The area of an unbalanced htree (rows != columns)
+ * depends on how data is traversed.
+ * In the following function, if (no. of rows < no. of columns),
+ * the data first traverses the excess horizontal links until the number
+ * of vertical and horizontal nodes is the same.
+ * If the no. of rows is bigger, the data traverses a horizontal link
+ * followed by a vertical link in a repeated fashion (similar to a
+ * balanced tree) until there are no horizontal links left. After this
+ * it goes through the remaining vertical links.
+ *
+ * Members written here: area.h, area.w, delay,
+ * power.readOp.dynamic, power.readOp.leakage, power.readOp.gate_leakage
+ * and power.searchOp.dynamic. wire_bw is doubled in place while the tree
+ * is walked. When uca_tree is false, input_nand() is called for every
+ * level with sizes derived from the repeater parameters of the Wire
+ * objects built for that level.
+ *
+ * NOTE(review): unlike out_htree(), power.readOp.gate_leakage is not
+ * reset to 0 here before it is accumulated - confirm this is intended.
+ */
+  void
+Htree2::in_htree()
+{
+  // temporaries: s1..s3 and l_eff are the arguments handed to input_nand()
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0;
+
+  int h = (int) _log2(ndwl/2); // horizontal nodes = log2(ndwl/2), truncated to int
+  int v = (int) _log2(ndbl/2); // vertical nodes = log2(ndbl/2), truncated to int
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {//Sheng: this computation does not consider the wires that route from edge to middle.
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+  {
+    if (ndwl == ndbl) { // balanced tree
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else if (ndwl > ndbl) { // more horizontal than vertical nodes
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) + // NOTE(review): "+ +" is a stray unary plus; it does not change the value
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else { // more vertical than horizontal nodes
+       double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+
+  area.h   = ht_temp * 2; // every formula above ends in /2; the area uses the doubled value
+  area.w   = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.searchOp.dynamic =0;
+  len = len_temp; // length of the first horizontal link
+  ht  = ht_temp/2; // length of the first vertical link
+
+  while (v > 0 || h > 0) // one iteration per tree level until both node counts reach 0
+  {
+    if (wtemp1) delete wtemp1; // release the wires built by the previous iteration
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if (h > v)
+    {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len; // ht_temp is left untouched in this branch
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0; // horizontal-only step
+    }
+    else if (v>0 && h>0)
+    {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht  /= 2;
+      v--;
+      h--;
+      option = 1; // horizontal + vertical step; the only option that reaches the "second level" code
+    }
+    else
+    {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // ver
+      wtemp2 = new Wire(wt, ht/2);  // hor
+      ht_temp = ht; // len_temp is left untouched in this branch
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2; // vertical-only step
+    }
+
+    delay += wtemp1->delay; // first link of this level
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2;  // wire bandwidth doubles only for vertical branches (and on every level when search_tree is set)
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size; // link shorter than one repeater spacing: scale the size down
+        l_eff = len_temp;
+      }
+
+      if (ht_temp > wtemp2->repeater_spacing)
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): scales by len_temp although the test above uses ht_temp - confirm
+      }
+      // first level
+      input_nand(s1, s2, l_eff);
+    }
+
+
+    if (option != 1)
+    {
+      continue; // single-link steps have no second level
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+
+    if (uca_tree)
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw); // NOTE(review): wtemp2 leakage was already added just above; both branches add it again - confirm
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw); // NOTE(review): same second addition as in the uca_tree branch
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3    = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3    = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size; // NOTE(review): uses len_temp while the test and l_eff use ht_temp - confirm
+        l_eff = ht_temp;
+      }
+
+      input_nand(s2, s3, l_eff);
+    }
+  }
+
+  if (wtemp1) delete wtemp1; // wires of the last iteration
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
+
+
+/* Calculates the output h-tree delay/power and sets the h-tree area.
+ * A tristate buffer is used to handle fan-ins.
+ * The area of an unbalanced htree (rows != columns)
+ * depends on how data is traversed.
+ * In the following function, if (no. of rows < no. of columns),
+ * the data first traverses the excess horizontal links until the number
+ * of vertical and horizontal nodes is the same.
+ * If the no. of rows is bigger, the data traverses a horizontal link
+ * followed by a vertical link in a repeated fashion (similar to a
+ * balanced tree) until there are no horizontal links left. After this
+ * it goes through the remaining vertical links.
+ *
+ * Members written here: area.h, area.w, delay,
+ * power.readOp.dynamic, power.readOp.leakage, power.readOp.gate_leakage
+ * and power.searchOp.dynamic. wire_bw is doubled in place while the tree
+ * is walked. When uca_tree is false, output_buffer() is called for every
+ * level with sizes derived from the repeater parameters of the Wire
+ * objects built for that level.
+ *
+ * NOTE(review): power.searchOp.dynamic is accumulated (scaled by
+ * init_wire_bw) but, unlike in in_htree(), it is not reset to 0 here -
+ * confirm this is intended.
+ */
+void Htree2::out_htree()
+{
+  // temporaries: s1..s3 and l_eff are the arguments handed to output_buffer()
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0;
+
+  int h = (int) _log2(ndwl/2); // horizontal nodes = log2(ndwl/2), truncated to int
+  int v = (int) _log2(ndbl/2); // vertical nodes = log2(ndbl/2), truncated to int
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+    {
+    if (ndwl == ndbl) { // balanced tree
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+
+    }
+    else if (ndwl > ndbl) { // more horizontal than vertical nodes
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else { // more vertical than horizontal nodes
+      double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+  area.h = ht_temp * 2; // every formula above ends in /2; the area uses the doubled value
+  area.w = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.readOp.gate_leakage = 0;
+  //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+  len = len_temp; // length of the first horizontal link
+  ht = ht_temp/2; // length of the first vertical link
+
+  while (v > 0 || h > 0)
+  { //finds delay/power of each link in the tree
+    if (wtemp1) delete wtemp1; // release the wires built by the previous iteration
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if(h > v) {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len; // ht_temp is left untouched in this branch
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0; // horizontal-only step
+    }
+    else if (v>0 && h>0) {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht /= 2;
+      v--;
+      h--;
+      option = 1; // horizontal + vertical step; the only option that reaches the "second level" code
+    }
+    else {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // ver
+      wtemp2 = new Wire(wt, ht/2);  // hor
+      ht_temp = ht; // len_temp is left untouched in this branch
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2; // vertical-only step
+    }
+    delay += wtemp1->delay; // first link of this level
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw; // scaled by the root bus width, not the running wire_bw
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2; // doubles for vertical-only steps, or on every level when search_tree is set
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size; // link shorter than one repeater spacing: scale the size down
+        l_eff = len_temp;
+      }
+      if (ht_temp > wtemp2->repeater_spacing)
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): scales by len_temp although the test above uses ht_temp - confirm
+      }
+      // first level
+      output_buffer(s1, s2, l_eff);
+    }
+
+
+    if (option != 1)
+    {
+      continue; // single-link steps have no second level
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if (uca_tree)
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw); // NOTE(review): wtemp2 leakage was already added just above; both branches add it again - confirm
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw); // NOTE(review): same second addition as in the uca_tree branch
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3 = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size; // NOTE(review): uses len_temp while the test and l_eff use ht_temp - confirm
+        l_eff = ht_temp;
+      }
+
+      output_buffer(s2, s3, l_eff);
+    }
+    //cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    //cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
+  }
+
+  if (wtemp1) delete wtemp1; // wires of the last iteration
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
diff --git a/ext/mcpat/cacti/htree2.h b/ext/mcpat/cacti/htree2.h
new file mode 100644 (file)
index 0000000..053e43a
--- /dev/null
@@ -0,0 +1,97 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __HTREE2_H__
+#define __HTREE2_H__
+
+#include "assert.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+#include "subarray.h"
+#include "wire.h"
+
+// leakage power includes the entire htree in a bank (when uca_tree == false)
+// leakage power includes only the part to one bank when uca_tree == true
+
+class Htree2 : public Component
+{
+  public:
+    Htree2(enum Wire_type wire_model,
+        double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
+        enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Htree2() {};
+
+    void in_htree();
+    void out_htree();
+
+    // repeaters only at h-tree nodes
+    void limited_in_htree();
+    void limited_out_htree();
+    void input_nand(double s1, double s2, double l);
+    void output_buffer(double s1, double s2, double l);
+
+    double in_rise_time, out_rise_time;
+
+    void set_in_rise_time(double rt)
+    {
+      in_rise_time = rt;
+    }
+
+    double max_unpipelined_link_delay;
+    powerDef power_bit;
+
+
+  private:
+    double wire_bw;
+    double init_wire_bw;  // bus width at root
+    enum Htree_type tree_type;
+    double htree_hnodes;
+    double htree_vnodes;
+    double mat_width;
+    double mat_height;
+    int add_bits, data_in_bits,search_data_in_bits,data_out_bits,  search_data_out_bits;
+    int ndbl, ndwl;
+    bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
+    bool search_tree;
+
+    enum Wire_type wt;
+    double min_w_nmos;
+    double min_w_pmos;
+
+    TechnologyParameter::DeviceType *deviceType;
+
+};
+
+#endif
diff --git a/ext/mcpat/cacti/io.cc b/ext/mcpat/cacti/io.cc
new file mode 100644 (file)
index 0000000..56725ab
--- /dev/null
@@ -0,0 +1,2350 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "Ucache.h"
+#include "arbiter.h"
+#include "area.h"
+#include "basic_circuit.h"
+#include "crossbar.h"
+#include "io.h"
+#include "nuca.h"
+#include "parameter.h"
+//#include "highradix.h"
+
+using namespace std;
+
+
+/* Parses "cache.cfg" file */
+  void
+InputParameter::parse_cfg(const string & in_file)
+{
+  FILE *fp = fopen(in_file.c_str(), "r");
+  char line[5000];
+  char jk[5000];
+  char temp_var[5000];
+
+  if(!fp) {
+    cout << in_file << " is missing!\n";
+    exit(-1);
+  }
+
+  while(fscanf(fp, "%[^\n]\n", line) != EOF) {
+
+    if (!strncmp("-size", line, strlen("-size"))) {
+      sscanf(line, "-size %[(:-~)*]%u", jk, &(cache_sz));
+      continue;
+    }
+
+    if (!strncmp("-page size", line, strlen("-page size"))) {
+      sscanf(line, "-page size %[(:-~)*]%u", jk, &(page_sz_bits));
+      continue;
+    }
+
+    if (!strncmp("-burst length", line, strlen("-burst length"))) {
+      sscanf(line, "-burst %[(:-~)*]%u", jk, &(burst_len));
+      continue;
+    }
+
+    if (!strncmp("-internal prefetch width", line, strlen("-internal prefetch width"))) {
+      sscanf(line, "-internal prefetch %[(:-~)*]%u", jk, &(int_prefetch_w));
+      continue;
+    }
+
+    if (!strncmp("-block", line, strlen("-block"))) {
+      sscanf(line, "-block size (bytes) %d", &(line_sz));
+      continue;
+    }
+
+    if (!strncmp("-associativity", line, strlen("-associativity"))) {
+      sscanf(line, "-associativity %d", &(assoc));
+      continue;
+    }
+
+    if (!strncmp("-read-write", line, strlen("-read-write"))) {
+      sscanf(line, "-read-write port %d", &(num_rw_ports));
+      continue;
+    }
+
+    if (!strncmp("-exclusive read", line, strlen("exclusive read"))) {
+      sscanf(line, "-exclusive read port %d", &(num_rd_ports));
+      continue;
+    }
+
+    if(!strncmp("-exclusive write", line, strlen("-exclusive write"))) {
+      sscanf(line, "-exclusive write port %d", &(num_wr_ports));
+      continue;
+    }
+
+    if (!strncmp("-single ended", line, strlen("-single ended"))) {
+      sscanf(line, "-single %[(:-~)*]%d", jk,
+          &(num_se_rd_ports));
+      continue;
+    }
+
+    if (!strncmp("-search", line, strlen("-search"))) {
+      sscanf(line, "-search port %d", &(num_search_ports));
+      continue;
+    }
+
+    if (!strncmp("-UCA bank", line, strlen("-UCA bank"))) {
+      sscanf(line, "-UCA bank%[((:-~)| )*]%d", jk, &(nbanks));
+      continue;
+    }
+
+    if (!strncmp("-technology", line, strlen("-technology"))) {
+      sscanf(line, "-technology (u) %lf", &(F_sz_um));
+      F_sz_nm = F_sz_um*1000;
+      continue;
+    }
+
+    if (!strncmp("-output/input", line, strlen("-output/input"))) {
+      sscanf(line, "-output/input bus %[(:-~)*]%d", jk, &(out_w));
+      continue;
+    }
+
+    if (!strncmp("-operating temperature", line, strlen("-operating temperature"))) {
+      sscanf(line, "-operating temperature %[(:-~)*]%d", jk, &(temp));
+      continue;
+    }
+
+    if (!strncmp("-cache type", line, strlen("-cache type"))) {
+      sscanf(line, "-cache type%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("cache", temp_var, sizeof("cache"))) {
+        is_cache = true;
+      }
+      else
+      {
+        is_cache = false;
+      }
+
+      if (!strncmp("main memory", temp_var, sizeof("main memory"))) {
+        is_main_mem = true;
+      }
+      else {
+        is_main_mem = false;
+      }
+
+      if (!strncmp("cam", temp_var, sizeof("cam"))) {
+        pure_cam = true;
+      }
+      else {
+        pure_cam = false;
+      }
+
+      if (!strncmp("ram", temp_var, sizeof("ram"))) {
+        pure_ram = true;
+      }
+      else {
+          if (!is_main_mem)
+                  pure_ram = false;
+          else
+                  pure_ram = true;
+      }
+
+      continue;
+    }
+
+
+    if (!strncmp("-tag size", line, strlen("-tag size"))) {
+      sscanf(line, "-tag size%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("default", temp_var, sizeof("default"))) {
+        specific_tag = false;
+        tag_w = 42; /* the acutal value is calculated
+                     * later based on the cache size, bank count, and associativity
+                     */
+      }
+      else {
+        specific_tag = true;
+        sscanf(line, "-tag size (b) %d", &(tag_w));
+      }
+      continue;
+    }
+
+    if (!strncmp("-access mode", line, strlen("-access mode"))) {
+      sscanf(line, "-access %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("fast", temp_var, strlen("fast"))) {
+        access_mode = 2;
+      }
+      else if (!strncmp("sequential", temp_var, strlen("sequential"))) {
+        access_mode = 1;
+      }
+      else if(!strncmp("normal", temp_var, strlen("normal"))) {
+        access_mode = 0;
+      }
+      else {
+        cout << "ERROR: Invalid access mode!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Data array cell type", line, strlen("-Data array cell type"))) {
+      sscanf(line, "-Data array cell type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        data_arr_ram_cell_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        data_arr_ram_cell_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        data_arr_ram_cell_tech_type = 2;
+      }
+      else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) {
+        data_arr_ram_cell_tech_type = 3;
+      }
+      else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) {
+        data_arr_ram_cell_tech_type = 4;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Data array peripheral type", line, strlen("-Data array peripheral type"))) {
+      sscanf(line, "-Data array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        data_arr_peri_global_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        data_arr_peri_global_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        data_arr_peri_global_tech_type = 2;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Tag array cell type", line, strlen("-Tag array cell type"))) {
+      sscanf(line, "-Tag array cell type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        tag_arr_ram_cell_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        tag_arr_ram_cell_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        tag_arr_ram_cell_tech_type = 2;
+      }
+      else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) {
+        tag_arr_ram_cell_tech_type = 3;
+      }
+      else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) {
+        tag_arr_ram_cell_tech_type = 4;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Tag array peripheral type", line, strlen("-Tag array peripheral type"))) {
+      sscanf(line, "-Tag array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        tag_arr_peri_global_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        tag_arr_peri_global_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        tag_arr_peri_global_tech_type = 2;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+    if(!strncmp("-design", line, strlen("-design"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_wt), &(dynamic_power_wt),
+          &(leakage_power_wt),
+          &(cycle_time_wt), &(area_wt));
+      continue;
+    }
+
+    if(!strncmp("-deviate", line, strlen("-deviate"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_dev), &(dynamic_power_dev),
+          &(leakage_power_dev),
+          &(cycle_time_dev), &(area_dev));
+      continue;
+    }
+
+    if(!strncmp("-Optimize", line, strlen("-Optimize"))) {
+      sscanf(line, "-Optimize  %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("ED^2", temp_var, strlen("ED^2"))) {
+        ed = 2;
+      }
+      else if(!strncmp("ED", temp_var, strlen("ED"))) {
+        ed = 1;
+      }
+      else {
+        ed = 0;
+      }
+    }
+
+    if(!strncmp("-NUCAdesign", line, strlen("-NUCAdesign"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_wt_nuca), &(dynamic_power_wt_nuca),
+          &(leakage_power_wt_nuca),
+          &(cycle_time_wt_nuca), &(area_wt_nuca));
+      continue;
+    }
+
+    if(!strncmp("-NUCAdeviate", line, strlen("-NUCAdeviate"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_dev_nuca), &(dynamic_power_dev_nuca),
+          &(leakage_power_dev_nuca),
+          &(cycle_time_dev_nuca), &(area_dev_nuca));
+      continue;
+    }
+
+    if(!strncmp("-Cache model", line, strlen("-cache model"))) {
+      sscanf(line, "-Cache model %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("UCA", temp_var, strlen("UCA"))) {
+        nuca = 0;
+      }
+      else {
+        nuca = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-NUCA bank", line, strlen("-NUCA bank"))) {
+      sscanf(line, "-NUCA bank count %d", &(nuca_bank_count));
+
+      if (nuca_bank_count != 0) {
+        force_nuca_bank = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Wire inside mat", line, strlen("-Wire inside mat"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("global", temp_var, strlen("global"))) {
+        wire_is_mat_type = 2;
+        continue;
+      }
+      else if (!strncmp("local", temp_var, strlen("local"))) {
+        wire_is_mat_type = 0;
+        continue;
+      }
+      else {
+        wire_is_mat_type = 1;
+        continue;
+      }
+    }
+
+    if(!strncmp("-Wire outside mat", line, strlen("-Wire outside mat"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("global", temp_var, strlen("global"))) {
+        wire_os_mat_type = 2;
+      }
+      else {
+        wire_os_mat_type = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Interconnect projection", line, strlen("-Interconnect projection"))) {
+      sscanf(line, "-Interconnect projection%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("aggressive", temp_var, strlen("aggressive"))) {
+        ic_proj_type = 0;
+      }
+      else {
+        ic_proj_type = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Wire signalling", line, strlen("-wire signalling"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("default", temp_var, strlen("default"))) {
+        force_wiretype = 0;
+        wt = Global;
+      }
+      else if (!(strncmp("Global_10", temp_var, strlen("Global_10")))) {
+        force_wiretype = 1;
+        wt = Global_10;
+      }
+      else if (!(strncmp("Global_20", temp_var, strlen("Global_20")))) {
+        force_wiretype = 1;
+        wt = Global_20;
+      }
+      else if (!(strncmp("Global_30", temp_var, strlen("Global_30")))) {
+        force_wiretype = 1;
+        wt = Global_30;
+      }
+      else if (!(strncmp("Global_5", temp_var, strlen("Global_5")))) {
+        force_wiretype = 1;
+        wt = Global_5;
+      }
+      else if (!(strncmp("Global", temp_var, strlen("Global")))) {
+        force_wiretype = 1;
+        wt = Global;
+      }
+      else {
+        wt = Low_swing;
+        force_wiretype = 1;
+      }
+      continue;
+    }
+
+
+
+    if(!strncmp("-Core", line, strlen("-Core"))) {
+      sscanf(line, "-Core count %d\n", &(cores));
+      if (cores > 16) {
+        printf("No. of cores should be less than 16!\n");
+      }
+      continue;
+    }
+
+    if(!strncmp("-Cache level", line, strlen("-Cache level"))) {
+      sscanf(line, "-Cache l%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("L2", temp_var, strlen("L2"))) {
+        cache_level = 0;
+      }
+      else {
+        cache_level = 1;
+      }
+    }
+
+    if(!strncmp("-Print level", line, strlen("-Print level"))) {
+      sscanf(line, "-Print l%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("DETAILED", temp_var, strlen("DETAILED"))) {
+        print_detail = 1;
+      }
+      else {
+        print_detail = 0;
+      }
+
+    }
+    if(!strncmp("-Add ECC", line, strlen("-Add ECC"))) {
+      sscanf(line, "-Add ECC %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        add_ecc_b_ = true;
+      }
+      else {
+        add_ecc_b_ = false;
+      }
+    }
+
+    if(!strncmp("-Print input parameters", line, strlen("-Print input parameters"))) {
+      sscanf(line, "-Print input %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        print_input_args = true;
+      }
+      else {
+        print_input_args = false;
+      }
+    }
+
+    if(!strncmp("-Force cache config", line, strlen("-Force cache config"))) {
+      sscanf(line, "-Force cache %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        force_cache_config = true;
+      }
+      else {
+        force_cache_config = false;
+      }
+    }
+
+    if(!strncmp("-Ndbl", line, strlen("-Ndbl"))) {
+      sscanf(line, "-Ndbl %d\n", &(ndbl));
+      continue;
+    }
+    if(!strncmp("-Ndwl", line, strlen("-Ndwl"))) {
+      sscanf(line, "-Ndwl %d\n", &(ndwl));
+      continue;
+    }
+    if(!strncmp("-Nspd", line, strlen("-Nspd"))) {
+      sscanf(line, "-Nspd %d\n", &(nspd));
+      continue;
+    }
+    if(!strncmp("-Ndsam1", line, strlen("-Ndsam1"))) {
+      sscanf(line, "-Ndsam1 %d\n", &(ndsam1));
+      continue;
+    }
+    if(!strncmp("-Ndsam2", line, strlen("-Ndsam2"))) {
+      sscanf(line, "-Ndsam2 %d\n", &(ndsam2));
+      continue;
+    }
+   if(!strncmp("-Ndcm", line, strlen("-Ndcm"))) {
+      sscanf(line, "-Ndcm %d\n", &(ndcm));
+      continue;
+    }
+
+  }
+  rpters_in_htree = true;
+  fclose(fp);
+}
+
+// Prints this object's input parameters to stdout, one "label : value" line
+// per field. The search-port count is printed only for fully associative
+// caches and CAMs, the core count and NUCA objectives only when nuca is set,
+// and the forced partitioning values (Ndwl, Ndbl, ...) only when
+// force_cache_config is set.
+void
+InputParameter::display_ip()
+{
+  cout << "Cache size                    : " << cache_sz << endl;
+  cout << "Block size                    : " << line_sz << endl;
+  cout << "Associativity                 : " << assoc << endl;
+  cout << "Read only ports               : " << num_rd_ports << endl;
+  cout << "Write only ports              : " << num_wr_ports << endl;
+  cout << "Read write ports              : " << num_rw_ports << endl;
+  cout << "Single ended read ports       : " << num_se_rd_ports << endl;
+  if (fully_assoc || pure_cam) {
+    cout << "Search ports                  : " << num_search_ports << endl;
+  }
+  cout << "Cache banks (UCA)             : " << nbanks << endl;
+  cout << "Technology                    : " << F_sz_um << endl;
+  cout << "Temperature                   : " << temp << endl;
+  cout << "Tag size                      : " << tag_w << endl;
+
+  // The three array-type flags are tested independently, exactly one line is
+  // printed per flag that is set.
+  if (is_cache) {
+    cout << "array type                    : " << "Cache" << endl;
+  }
+  if (pure_ram) {
+    cout << "array type                    : " << "Scratch RAM" << endl;
+  }
+  if (pure_cam) {
+    cout << "array type                    : " << "CAM" << endl;
+  }
+
+  cout << "Model as memory               : " << is_main_mem << endl;
+  cout << "Access mode                   : " << access_mode << endl;
+  cout << "Data array cell type          : " << data_arr_ram_cell_tech_type << endl;
+  cout << "Data array peripheral type    : " << data_arr_peri_global_tech_type << endl;
+  cout << "Tag array cell type           : " << tag_arr_ram_cell_tech_type << endl;
+  cout << "Tag array peripheral type     : " << tag_arr_peri_global_tech_type << endl;
+  cout << "Optimization target           : " << ed << endl;
+  cout << "Design objective (UCA wt)     : " << delay_wt << " "
+       << dynamic_power_wt << " " << leakage_power_wt << " " << cycle_time_wt
+       << " " << area_wt << endl;
+  cout << "Design objective (UCA dev)    : " << delay_dev << " "
+       << dynamic_power_dev << " " << leakage_power_dev << " " << cycle_time_dev
+       << " " << area_dev << endl;
+  if (nuca) {
+    cout << "Cores                         : " << cores << endl;
+
+    cout << "Design objective (NUCA wt)    : " << delay_wt_nuca << " "
+         << dynamic_power_wt_nuca << " " << leakage_power_wt_nuca << " " << cycle_time_wt_nuca
+         << " " << area_wt_nuca << endl;
+    cout << "Design objective (NUCA dev)   : " << delay_dev_nuca << " "
+         << dynamic_power_dev_nuca << " " << leakage_power_dev_nuca << " " << cycle_time_dev_nuca
+         << " " << area_dev_nuca << endl;
+  }
+  cout << "Cache model                   : " << nuca << endl;
+  cout << "Nuca bank                     : " << nuca_bank_count << endl;
+  cout << "Wire inside mat               : " << wire_is_mat_type << endl;
+  cout << "Wire outside mat              : " << wire_os_mat_type << endl;
+  cout << "Interconnect projection       : " << ic_proj_type << endl;
+  cout << "Wire signalling               : " << force_wiretype << endl;
+  cout << "Print level                   : " << print_detail << endl;
+  cout << "ECC overhead                  : " << add_ecc_b_ << endl;
+  cout << "Page size                     : " << page_sz_bits << endl;
+  cout << "Burst length                  : " << burst_len << endl;
+  cout << "Internal prefetch width       : " << int_prefetch_w << endl;
+
+  // Use this object's own members (not the global g_ip) so that the dump is
+  // correct for whichever InputParameter instance it is called on.
+  cout << "Force cache config            : " << force_cache_config << endl;
+  if (force_cache_config) {
+    cout << "Ndwl                          : " << ndwl << endl;
+    cout << "Ndbl                          : " << ndbl << endl;
+    cout << "Nspd                          : " << nspd << endl;
+    cout << "Ndcm                          : " << ndcm << endl;
+    cout << "Ndsam1                        : " << ndsam1 << endl;
+    cout << "Ndsam2                        : " << ndsam2 << endl;
+  }
+}
+
+
+
+// Field-by-field sum of two powerComponents values.
+powerComponents operator+(const powerComponents & x, const powerComponents & y)
+{
+  powerComponents total;
+
+  total.dynamic                = x.dynamic                + y.dynamic;
+  total.leakage                = x.leakage                + y.leakage;
+  total.gate_leakage           = x.gate_leakage           + y.gate_leakage;
+  total.short_circuit          = x.short_circuit          + y.short_circuit;
+  total.longer_channel_leakage = x.longer_channel_leakage + y.longer_channel_leakage;
+
+  return total;
+}
+
+// Scales each field of x by an entry of the array y: y[0] for dynamic, y[1]
+// for leakage, y[2] for gate leakage and y[3] for short circuit.
+powerComponents operator*(const powerComponents & x, double const * const y)
+{
+  powerComponents scaled;
+
+  scaled.dynamic       = x.dynamic       * y[0];
+  scaled.leakage       = x.leakage       * y[1];
+  scaled.gate_leakage  = x.gate_leakage  * y[2];
+  scaled.short_circuit = x.short_circuit * y[3];
+  // Longer-channel leakage behaves like normal leakage, so it shares y[1].
+  scaled.longer_channel_leakage = x.longer_channel_leakage * y[1];
+
+  return scaled;
+}
+
+
+// Sums two powerDef values operation by operation (read, write, search).
+powerDef operator+(const powerDef & x, const powerDef & y)
+{
+  powerDef total;
+
+  total.readOp   = x.readOp   + y.readOp;
+  total.writeOp  = x.writeOp  + y.writeOp;
+  total.searchOp = x.searchOp + y.searchOp;
+
+  return total;
+}
+
+// Applies the scaling array y to the read, write and search components of x;
+// each component is multiplied by the same array y.
+powerDef operator*(const powerDef & x, double const * const y)
+{
+  powerDef scaled;
+
+  scaled.readOp   = x.readOp * y;
+  scaled.writeOp  = x.writeOp * y;
+  scaled.searchOp = x.searchOp * y;
+
+  return scaled;
+}
+
+// File-driven entry point. Creates the global g_ip, fills it from the
+// configuration file infile_name via parse_cfg(), validates it, runs the
+// solver and returns the result. The result is also passed to output_UCA()
+// and output_data_csv(). If error_checking() rejects the input the process
+// exits with status 0. g_ip is deleted before returning.
+uca_org_t cacti_interface(const string & infile_name)
+{
+  uca_org_t result;
+  result.valid = false;
+
+  g_ip = new InputParameter();
+  g_ip->parse_cfg(infile_name);
+
+  if (g_ip->error_checking() == false) {
+    exit(0);
+  }
+
+  if (g_ip->print_input_args) {
+    g_ip->display_ip();
+  }
+
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+
+//  For HighRadix Only
+//  ////  Wire wirea(g_ip->wt, 1000);
+//  ////  wirea.print_wire();
+//  ////  cout << "Wire Area " << wirea.area.get_area() << " sq. u" << endl;
+//  //  winit.print_wire();
+//  //
+//    HighRadix *hr;
+//      hr = new HighRadix();
+//      hr->compute_power();
+//      hr->print_router();
+//    exit(0);
+//
+//    double sub_switch_sz = 2;
+//    double rows = 32;
+//    for (int i=0; i<6; i++) {
+//      sub_switch_sz = pow(2, i);
+//      rows = 64/sub_switch_sz;
+//      hr = new HighRadix(sub_switch_sz, rows, .8/* freq */, 64, 2, 64, 0.7);
+//      hr->compute_power();
+//      hr->print_router();
+//      delete hr;
+//    }
+//  //  HighRadix yarc;
+//  //  yarc.compute_power();
+//  //  yarc.print_router();
+//    winit.print_wire();
+//    exit(0);
+//  For HighRadix Only End
+
+  // The NUCA simulation runs only when the configuration asks for it.
+  if (g_ip->nuca == 1) {
+    Nuca nuca_model(&g_tp.peri_global);
+    nuca_model.sim_nuca();
+  }
+
+  // The parameters are always dumped here, in addition to the optional dump
+  // above that is controlled by print_input_args.
+  g_ip->display_ip();
+
+  solve(&result);
+  output_UCA(&result);
+  output_data_csv(result);
+
+  delete (g_ip);
+  return result;
+}
+
+//cacti6.5's plain interface, please keep !!!
+//
+// Creates the global g_ip, fills it from the scalar arguments, validates it
+// with error_checking() (the process exits with status 0 on failure), runs
+// solve() and output_UCA(), deletes g_ip and returns the result.
+//
+// tag_width == 0 selects a tag width of 42. ECC is always enabled and
+// print_detail is always 1.
+//
+// NOTE(review): after the arguments are copied, nuca is forced to 0, wt to
+// Global_5 and force_wiretype to false, so is_nuca and the values chosen by
+// the wt switch are overwritten (an unknown wt still terminates the process).
+// Confirm whether these overrides are intended.
+uca_org_t cacti_interface(
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int banks,
+    double tech_node, // in nm
+    int page_sz,
+    int burst_length,
+    int pre_width,
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode, //0 normal, 1 seq, 2 fast
+    int cache, //scratch ram or cache
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_area,
+    int obj_func_cycle_time,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area,
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in, // 0-4
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
+    int wire_inside_mat_type_in,
+    int wire_outside_mat_type_in,
+    int is_nuca, // 0 - UCA, 1 - NUCA
+    int core_count,
+    int cache_level, // 0 - L2, 1 - L3
+    int nuca_bank_count,
+    int nuca_obj_func_delay,
+    int nuca_obj_func_dynamic_power,
+    int nuca_obj_func_leakage_power,
+    int nuca_obj_func_area,
+    int nuca_obj_func_cycle_time,
+    int nuca_dev_func_delay,
+    int nuca_dev_func_dynamic_power,
+    int nuca_dev_func_leakage_power,
+    int nuca_dev_func_area,
+    int nuca_dev_func_cycle_time,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
+    int p_input)
+{
+  g_ip = new InputParameter();
+  g_ip->add_ecc_b_ = true;
+
+  // Device flavours for the data/tag arrays and their peripheral circuitry.
+  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+
+  g_ip->ic_proj_type     = interconnect_projection_type_in;
+  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+  g_ip->burst_len        = burst_length;
+  g_ip->int_prefetch_w   = pre_width;
+  g_ip->page_sz_bits     = page_sz;
+
+  g_ip->cache_sz            = cache_size;
+  g_ip->line_sz             = line_size;
+  g_ip->assoc               = associativity;
+  g_ip->nbanks              = banks;
+  g_ip->out_w               = output_width;
+  g_ip->specific_tag        = specific_tag;
+  if (tag_width == 0) {
+    g_ip->tag_w = 42;
+  }
+  else {
+    g_ip->tag_w               = tag_width;
+  }
+
+  // UCA optimization weights and allowed deviations.
+  g_ip->access_mode         = access_mode;
+  g_ip->delay_wt = obj_func_delay;
+  g_ip->dynamic_power_wt = obj_func_dynamic_power;
+  g_ip->leakage_power_wt = obj_func_leakage_power;
+  g_ip->area_wt = obj_func_area;
+  g_ip->cycle_time_wt    = obj_func_cycle_time;
+  g_ip->delay_dev = dev_func_delay;
+  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+  g_ip->leakage_power_dev = dev_func_leakage_power;
+  g_ip->area_dev = dev_func_area;
+  g_ip->cycle_time_dev    = dev_func_cycle_time;
+  g_ip->ed = ed_ed2_none;
+
+  // Map the integer wire-type code onto the wire enum; unknown codes abort.
+  switch(wt) {
+    case (0):
+      g_ip->force_wiretype = 0;
+      g_ip->wt = Global;
+      break;
+    case (1):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global;
+      break;
+    case (2):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_5;
+      break;
+    case (3):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_10;
+      break;
+    case (4):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_20;
+      break;
+    case (5):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_30;
+      break;
+    case (6):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Low_swing;
+      break;
+    default:
+      cout << "Unknown wire type!\n";
+      exit(0);
+  }
+
+  // NUCA optimization weights and allowed deviations.
+  g_ip->delay_wt_nuca = nuca_obj_func_delay;
+  g_ip->dynamic_power_wt_nuca = nuca_obj_func_dynamic_power;
+  g_ip->leakage_power_wt_nuca = nuca_obj_func_leakage_power;
+  g_ip->area_wt_nuca = nuca_obj_func_area;
+  g_ip->cycle_time_wt_nuca    = nuca_obj_func_cycle_time;
+  // Use the NUCA-specific delay deviation; the UCA value dev_func_delay was
+  // previously copied here by mistake, leaving nuca_dev_func_delay unused.
+  g_ip->delay_dev_nuca = nuca_dev_func_delay;
+  g_ip->dynamic_power_dev_nuca = nuca_dev_func_dynamic_power;
+  g_ip->leakage_power_dev_nuca = nuca_dev_func_leakage_power;
+  g_ip->area_dev_nuca = nuca_dev_func_area;
+  g_ip->cycle_time_dev_nuca    = nuca_dev_func_cycle_time;
+  g_ip->nuca = is_nuca;
+  g_ip->nuca_bank_count = nuca_bank_count;
+  if(nuca_bank_count > 0) {
+    g_ip->force_nuca_bank = 1;
+  }
+  g_ip->cores = core_count;
+  g_ip->cache_level = cache_level;
+
+  g_ip->temp = temp;
+
+  g_ip->F_sz_nm         = tech_node;
+  g_ip->F_sz_um         = tech_node / 1000;
+  g_ip->is_main_mem     = (main_mem != 0);
+  g_ip->is_cache        = (cache != 0);
+  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0);
+
+  g_ip->num_rw_ports    = rw_ports;
+  g_ip->num_rd_ports    = excl_read_ports;
+  g_ip->num_wr_ports    = excl_write_ports;
+  g_ip->num_se_rd_ports = single_ended_read_ports;
+  g_ip->print_detail = 1;
+  // NOTE(review): overrides the is_nuca value stored above.
+  g_ip->nuca = 0;
+
+  // NOTE(review): overrides the wire type selected by the switch above.
+  g_ip->wt = Global_5;
+  g_ip->force_cache_config = false;
+  g_ip->force_wiretype = false;
+  g_ip->print_input_args = p_input;
+
+
+  uca_org_t fin_res;
+  fin_res.valid = false;
+
+  if (g_ip->error_checking() == false) exit(0);
+  if (g_ip->print_input_args)
+    g_ip->display_ip();
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  if (g_ip->nuca == 1)
+  {
+    Nuca n(&g_tp.peri_global);
+    n.sim_nuca();
+  }
+  solve(&fin_res);
+
+  output_UCA(&fin_res);
+
+  delete (g_ip);
+  return fin_res;
+}
+
+//McPAT's plain interface, please keep !!!
+//
+// Creates the global g_ip, fills it from the scalar arguments, validates it
+// with error_checking() (the process exits with status 0 on failure), dumps
+// the parameters, runs solve(), output_UCA() and output_data_csv(), deletes
+// g_ip and returns the result.
+//
+// specific_tag == 0 selects a tag width of 42, otherwise tag_width is used.
+// cache selects the array type: 1 = cache, 0 = pure RAM, 2 = pure CAM.
+uca_org_t cacti_interface(
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,// para5
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int search_ports,
+    int banks,
+    double tech_node,//para10
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode,
+    int cache,      //para15
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_cycle_time, //para20
+    int obj_func_area,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area, //para25
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in,//para30
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in,
+    int wire_inside_mat_type_in,//para35
+    int wire_outside_mat_type_in,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,
+    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+    int PAGE_SIZE_BITS_in,//para40
+    int BURST_LENGTH_in,
+    int INTERNAL_PREFETCH_WIDTH_in,
+    int force_wiretype,
+    int wiretype,
+    int force_config,//para45
+    int ndwl,
+    int ndbl,
+    int nspd,
+    int ndcm,
+    int ndsam1,//para50
+    int ndsam2,
+    int ecc)
+{
+  g_ip = new InputParameter();
+
+  uca_org_t result;
+  result.valid = false;
+
+  // Device flavours for the data/tag arrays and their peripheral circuitry.
+  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+
+  // Interconnect and DRAM-interface settings.
+  g_ip->ic_proj_type     = interconnect_projection_type_in;
+  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+  g_ip->burst_len        = BURST_LENGTH_in;
+  g_ip->int_prefetch_w   = INTERNAL_PREFETCH_WIDTH_in;
+  g_ip->page_sz_bits     = PAGE_SIZE_BITS_in;
+
+  // Basic array geometry.
+  g_ip->cache_sz     = cache_size;
+  g_ip->line_sz      = line_size;
+  g_ip->assoc        = associativity;
+  g_ip->nbanks       = banks;
+  g_ip->out_w        = output_width;
+  g_ip->specific_tag = specific_tag;
+  if (specific_tag != 0) {
+    g_ip->tag_w = tag_width;
+  }
+  else {
+    g_ip->tag_w = 42;
+  }
+
+  // Optimization weights and allowed deviations.
+  g_ip->access_mode       = access_mode;
+  g_ip->delay_wt          = obj_func_delay;
+  g_ip->dynamic_power_wt  = obj_func_dynamic_power;
+  g_ip->leakage_power_wt  = obj_func_leakage_power;
+  g_ip->area_wt           = obj_func_area;
+  g_ip->cycle_time_wt     = obj_func_cycle_time;
+  g_ip->delay_dev         = dev_func_delay;
+  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+  g_ip->leakage_power_dev = dev_func_leakage_power;
+  g_ip->area_dev          = dev_func_area;
+  g_ip->cycle_time_dev    = dev_func_cycle_time;
+  g_ip->temp              = temp;
+  g_ip->ed                = ed_ed2_none;
+
+  g_ip->F_sz_nm         = tech_node;
+  g_ip->F_sz_um         = tech_node / 1000;
+  g_ip->is_main_mem     = (main_mem != 0);
+  g_ip->is_cache        = (cache == 1);
+  g_ip->pure_ram        = (cache == 0);
+  g_ip->pure_cam        = (cache == 2);
+  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0);
+  g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in;
+  g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in;
+
+  g_ip->num_rw_ports     = rw_ports;
+  g_ip->num_rd_ports     = excl_read_ports;
+  g_ip->num_wr_ports     = excl_write_ports;
+  g_ip->num_se_rd_ports  = single_ended_read_ports;
+  g_ip->num_search_ports = search_ports;
+
+  g_ip->print_detail = 1;
+  g_ip->nuca = 0;
+
+  // Wire type: not forced -> Global; forced -> chosen by the wiretype code.
+  // A forced but unrecognized code leaves wt untouched.
+  if (force_wiretype != 0) {
+    g_ip->force_wiretype = true;
+    switch (wiretype) {
+      case 10:
+        g_ip->wt = Global_10;
+        break;
+      case 20:
+        g_ip->wt = Global_20;
+        break;
+      case 30:
+        g_ip->wt = Global_30;
+        break;
+      case 5:
+        g_ip->wt = Global_5;
+        break;
+      case 0:
+        g_ip->wt = Low_swing;
+        break;
+    }
+  }
+  else {
+    g_ip->wt = Global;
+    g_ip->force_wiretype = false;
+  }
+  //g_ip->wt = Global_5;
+
+  // The partitioning parameters are copied only when a configuration is forced.
+  const bool forced_config = (force_config != 0);
+  g_ip->force_cache_config = forced_config;
+  if (forced_config) {
+    g_ip->ndbl   = ndbl;
+    g_ip->ndwl   = ndwl;
+    g_ip->nspd   = nspd;
+    g_ip->ndcm   = ndcm;
+    g_ip->ndsam1 = ndsam1;
+    g_ip->ndsam2 = ndsam2;
+  }
+
+  g_ip->add_ecc_b_ = (ecc != 0);
+
+  if (g_ip->error_checking() == false) {
+    exit(0);
+  }
+
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  g_ip->display_ip();
+  solve(&result);
+  output_UCA(&result);
+  output_data_csv(result);
+  delete (g_ip);
+
+  return result;
+}
+
+
+
+// Validates the input parameters held by this object and derives the values
+// the rest of the model reads (fast_access, fully_assoc, block_sz, tag_assoc,
+// data_assoc, is_seq_acc, the port counts and nsets).
+//
+// Returns false, after printing a message to cerr, at the first violated
+// constraint; returns true when every check passes. Members are written as
+// the checks proceed, so some of them are already updated when a later check
+// fails.
+bool InputParameter::error_checking()
+{
+  int  A;
+  bool seq_access  = false;
+  fast_access = true;
+
+  // access_mode: 0 = normal, 1 = sequential, 2 = fast. Any other value keeps
+  // the defaults set just above.
+  switch (access_mode)
+  {
+    case 0:
+      seq_access  = false;
+      fast_access = false;
+      break;
+    case 1:
+      seq_access  = true;
+      fast_access = false;
+      break;
+    case 2:
+      seq_access  = false;
+      fast_access = true;
+      break;
+  }
+
+  if(is_main_mem)
+  {
+    if(ic_proj_type == 0)
+    {
+      cerr << "DRAM model supports only conservative interconnect projection!\n\n";
+      return false;
+    }
+  }
+
+
+  uint32_t B = line_sz;
+
+  if (B < 1)
+  {
+    cerr << "Block size must >= 1" << endl;
+    return false;
+  }
+  else if (B*8 < out_w)
+  {
+    cerr << "Block size must be at least " << out_w/8 << endl;
+    return false;
+  }
+
+  if (F_sz_um <= 0)
+  {
+    cerr << "Feature size must be > 0" << endl;
+    return false;
+  }
+  else if (F_sz_um > 0.091)
+  {
+    cerr << "Feature size must be <= 90 nm" << endl;
+    return false;
+  }
+
+
+  uint32_t RWP  = num_rw_ports;
+  uint32_t ERP  = num_rd_ports;
+  uint32_t EWP  = num_wr_ports;
+  uint32_t NSER = num_se_rd_ports;
+  uint32_t SCHP = num_search_ports;
+
+//TODO: revisit this. This is an important feature. Sheng thought this should be used
+//  // If multiple banks and multiple ports are specified, then if number of ports is less than or equal to
+//  // the number of banks, we assume that the multiple ports are implemented via the multiple banks.
+//  // In such a case we assume that each bank has 1 RWP port.
+//  if ((RWP + ERP + EWP) <= nbanks && nbanks>1)
+//  {
+//    RWP  = 1;
+//    ERP  = 0;
+//    EWP  = 0;
+//    NSER = 0;
+//  }
+//  else if ((RWP < 0) || (EWP < 0) || (ERP < 0))
+//  {
+//    cerr << "Ports must >=0" << endl;
+//    return false;
+//  }
+//  else if (RWP > 2)
+//  {
+//    cerr << "Maximum of 2 read/write ports" << endl;
+//    return false;
+//  }
+//  else if ((RWP+ERP+EWP) < 1)
+  // Changed to new implementation:
+  // The number of ports specified at input is per bank
+  if ((RWP+ERP+EWP) < 1)
+  {
+    cerr << "Must have at least one port" << endl;
+    return false;
+  }
+
+  if (is_pow2(nbanks) == false)
+  {
+    cerr << "Number of subbanks should be greater than or equal to 1 and should be a power of 2" << endl;
+    return false;
+  }
+
+  // C is the capacity of a single bank.
+  int C = cache_sz/nbanks;
+  if (C < 64)
+  {
+    cerr << "Cache size must >=64" << endl;
+    return false;
+  }
+
+//TODO: revisit this
+//   if (pure_ram==true && assoc!=1)
+//    {
+//       cerr << "Pure RAM must have assoc as 1" << endl;
+//       return false;
+//    }
+
+  //fully assoc and cam check
+  if (is_cache && assoc==0)
+    fully_assoc = true;
+  else
+    fully_assoc = false;
+
+  if (pure_cam==true && assoc!=0)
+  {
+    cerr << "Pure CAM must have associativity as 0" << endl;
+    return false;
+  }
+
+  if (assoc==0 && (pure_cam==false && is_cache ==false))
+  {
+    cerr << "Only CAM or Fully associative cache can have associativity as 0" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (data_arr_ram_cell_tech_type!= tag_arr_ram_cell_tech_type
+           || data_arr_peri_global_tech_type != tag_arr_peri_global_tech_type  ))
+  {
+    cerr << "CAM and fully associative cache must have same device type for both data and tag array" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (data_arr_ram_cell_tech_type== lp_dram || data_arr_ram_cell_tech_type== comm_dram))
+  {
+    cerr << "DRAM based CAM and fully associative cache are not supported" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (is_main_mem==true))
+  {
+    cerr << "CAM and fully associative cache cannot be as main memory" << endl;
+    return false;
+  }
+
+  if ((fully_assoc || pure_cam) && SCHP<1)
+  {
+    cerr << "CAM and fully associative must have at least 1 search port" << endl;
+    return false;
+  }
+
+  // With no read/write or read-only ports, the search ports are counted as
+  // read ports for a CAM or fully associative array.
+  if (RWP==0 && ERP==0 && SCHP>0 && ((fully_assoc || pure_cam)))
+  {
+    ERP=SCHP;
+  }
+
+//    if ((!(fully_assoc || pure_cam)) && SCHP>=1)
+//    {
+//       cerr << "None CAM and fully associative cannot have search ports" << endl;
+//       return false;
+//    }
+
+  // A is the effective associativity: every block of the bank when assoc is
+  // 0, otherwise assoc itself (which must be a power of 2 when above 1).
+  if (assoc == 0)
+  {
+    A = C/B;
+    //fully_assoc = true;
+
+    // A block larger than the per-bank capacity gives A == 0, and the
+    // C/(B*A) divisions below would then divide by zero.
+    if (A < 1)
+    {
+      cerr << "Block size must not be larger than the cache size per bank" << endl;
+      return false;
+    }
+  }
+  else
+  {
+    if (assoc == 1)
+    {
+      A = 1;
+      //fully_assoc = false;
+    }
+    else
+    {
+      //fully_assoc = false;
+      A = assoc;
+      if (is_pow2(A) == false)
+      {
+        cerr << "Associativity must be a power of 2" << endl;
+        return false;
+      }
+    }
+  }
+
+  if (C/(B*A) <= 1 && assoc!=0)
+  {
+    cerr << "Number of sets is too small: " << endl;
+    cerr << " Need to either increase cache size, or decrease associativity or block size" << endl;
+    cerr << " (or use fully associative cache)" << endl;
+    return false;
+  }
+
+  block_sz = B;
+
+  /*dt: testing sequential access mode*/
+  if(seq_access)
+  {
+    tag_assoc  = A;
+    data_assoc = 1;
+    is_seq_acc = true;
+  }
+  else
+  {
+    tag_assoc  = A;
+    data_assoc = A;
+    is_seq_acc = false;
+  }
+
+  if (assoc==0)
+  {
+    data_assoc = 1;
+  }
+
+  // Write the (possibly adjusted) port counts back to the object.
+  num_rw_ports     = RWP;
+  num_rd_ports     = ERP;
+  num_wr_ports     = EWP;
+  num_se_rd_ports  = NSER;
+  if (!(fully_assoc || pure_cam))
+    num_search_ports = 0;
+  nsets            = C/(B*A);
+
+  if (temp < 300 || temp > 400 || temp%10 != 0)
+  {
+    cerr << temp << " Temperature must be between 300 and 400 Kelvin and multiple of 10." << endl;
+    return false;
+  }
+
+  if (nsets < 1)
+  {
+    cerr << "Less than one set..." << endl;
+    return false;
+  }
+
+  return true;
+}
+
+
+
+void output_data_csv(const uca_org_t & fin_res)
+{
+  //TODO: the csv output should remain
+  fstream file("out.csv", ios::in);
+  bool    print_index = file.fail();
+  file.close();
+
+  file.open("out.csv", ios::out|ios::app);
+  if (file.fail() == true)
+  {
+    cerr << "File out.csv could not be opened successfully" << endl;
+  }
+  else
+  {
+    if (print_index == true)
+    {
+      file << "Tech node (nm), ";
+      file << "Capacity (bytes), ";
+      file << "Number of banks, ";
+      file << "Associativity, ";
+      file << "Output width (bits), ";
+      file << "Access time (ns), ";
+      file << "Random cycle time (ns), ";
+//      file << "Multisubbank interleave cycle time (ns), ";
+
+//      file << "Delay request network (ns), ";
+//      file << "Delay inside mat (ns), ";
+//      file << "Delay reply network (ns), ";
+//      file << "Tag array access time (ns), ";
+//      file << "Data array access time (ns), ";
+//      file << "Refresh period (microsec), ";
+//      file << "DRAM array availability (%), ";
+      file << "Dynamic search energy (nJ), ";
+      file << "Dynamic read energy (nJ), ";
+      file << "Dynamic write energy (nJ), ";
+//      file << "Tag Dynamic read energy (nJ), ";
+//      file << "Data Dynamic read energy (nJ), ";
+//      file << "Dynamic read power (mW), ";
+      file << "Standby leakage per bank(mW), ";
+//      file << "Leakage per bank with leak power management (mW), ";
+//      file << "Leakage per bank with leak power management (mW), ";
+//      file << "Refresh power as percentage of standby leakage, ";
+      file << "Area (mm2), ";
+      file << "Ndwl, ";
+      file << "Ndbl, ";
+      file << "Nspd, ";
+      file << "Ndcm, ";
+      file << "Ndsam_level_1, ";
+      file << "Ndsam_level_2, ";
+      file << "Data arrary area efficiency %, ";
+      file << "Ntwl, ";
+      file << "Ntbl, ";
+      file << "Ntspd, ";
+      file << "Ntcm, ";
+      file << "Ntsam_level_1, ";
+      file << "Ntsam_level_2, ";
+      file << "Tag arrary area efficiency %, ";
+
+//      file << "Resistance per unit micron (ohm-micron), ";
+//      file << "Capacitance per unit micron (fF per micron), ";
+//      file << "Unit-length wire delay (ps), ";
+//      file << "FO4 delay (ps), ";
+//      file << "delay route to bank (including crossb delay) (ps), ";
+//      file << "Crossbar delay (ps), ";
+//      file << "Dyn read energy per access from closed page (nJ), ";
+//      file << "Dyn read energy per access from open page (nJ), ";
+//      file << "Leak power of an subbank with page closed (mW), ";
+//      file << "Leak power of a subbank with page  open (mW), ";
+//      file << "Leak power of request and reply networks (mW), ";
+//      file << "Number of subbanks, ";
+//      file << "Page size in bits, ";
+//      file << "Activate power, ";
+//      file << "Read power, ";
+//      file << "Write power, ";
+//      file << "Precharge power, ";
+//      file << "tRCD, ";
+//      file << "CAS latency, ";
+//      file << "Precharge delay, ";
+//      file << "Perc dyn energy bitlines, ";
+//      file << "perc dyn energy wordlines, ";
+//      file << "perc dyn energy outside mat, ";
+//      file << "Area opt (perc), ";
+//      file << "Delay opt (perc), ";
+//      file << "Repeater opt (perc), ";
+//      file << "Aspect ratio";
+      file << endl;
+    }
+    file << g_ip->F_sz_nm << ", ";
+    file << g_ip->cache_sz << ", ";
+    file << g_ip->nbanks << ", ";
+    file << g_ip->tag_assoc << ", ";
+    file << g_ip->out_w << ", ";
+    file << fin_res.access_time*1e+9 << ", ";
+    file << fin_res.cycle_time*1e+9 << ", ";
+//    file << fin_res.data_array2->multisubbank_interleave_cycle_time*1e+9 << ", ";
+//    file << fin_res.data_array2->delay_request_network*1e+9 << ", ";
+//    file << fin_res.data_array2->delay_inside_mat*1e+9 <<  ", ";
+//    file << fin_res.data_array2.delay_reply_network*1e+9 << ", ";
+
+//    if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram))
+//        {
+//       file << fin_res.tag_array2->access_time*1e+9 << ", ";
+//        }
+//    else
+//    {
+//     file << 0 << ", ";
+//    }
+//    file << fin_res.data_array2->access_time*1e+9 << ", ";
+//    file << fin_res.data_array2->dram_refresh_period*1e+6 << ", ";
+//    file << fin_res.data_array2->dram_array_availability <<  ", ";
+    if (g_ip->fully_assoc || g_ip->pure_cam)
+    {
+        file << fin_res.power.searchOp.dynamic*1e+9 << ", ";
+    }
+        else
+    {
+                file << "N/A" << ", ";
+    }
+    file << fin_res.power.readOp.dynamic*1e+9 << ", ";
+    file << fin_res.power.writeOp.dynamic*1e+9 << ", ";
+//    if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram))
+//        {
+//             file << fin_res.tag_array2->power.readOp.dynamic*1e+9 << ", ";
+//        }
+//             else
+//        {
+//                     file << "NA" << ", ";
+//        }
+//    file << fin_res.data_array2->power.readOp.dynamic*1e+9 << ", ";
+//    if (g_ip->fully_assoc || g_ip->pure_cam)
+//        {
+//         file << fin_res.power.searchOp.dynamic*1000/fin_res.cycle_time << ", ";
+//        }
+//             else
+//        {
+//             file << fin_res.power.readOp.dynamic*1000/fin_res.cycle_time << ", ";
+//        }
+
+    file <<( fin_res.power.readOp.leakage + fin_res.power.readOp.gate_leakage )*1000 << ", ";
+//    file << fin_res.leak_power_with_sleep_transistors_in_mats*1000 << ", ";
+//    file << fin_res.data_array.refresh_power / fin_res.data_array.total_power.readOp.leakage << ", ";
+    file << fin_res.area*1e-6 << ", ";
+
+    file << fin_res.data_array2->Ndwl << ", ";
+    file << fin_res.data_array2->Ndbl << ", ";
+    file << fin_res.data_array2->Nspd << ", ";
+    file << fin_res.data_array2->deg_bl_muxing << ", ";
+    file << fin_res.data_array2->Ndsam_lev_1 << ", ";
+    file << fin_res.data_array2->Ndsam_lev_2 << ", ";
+    file << fin_res.data_array2->area_efficiency << ", ";
+    if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram))
+    {
+    file << fin_res.tag_array2->Ndwl << ", ";
+    file << fin_res.tag_array2->Ndbl << ", ";
+    file << fin_res.tag_array2->Nspd << ", ";
+    file << fin_res.tag_array2->deg_bl_muxing << ", ";
+    file << fin_res.tag_array2->Ndsam_lev_1 << ", ";
+    file << fin_res.tag_array2->Ndsam_lev_2 << ", ";
+    file << fin_res.tag_array2->area_efficiency << ", ";
+    }
+    else
+    {
+        file << "N/A" << ", ";
+        file << "N/A"<< ", ";
+        file << "N/A" << ", ";
+        file << "N/A" << ", ";
+        file << "N/A" << ", ";
+        file << "N/A" << ", ";
+        file << "N/A" << ", ";
+    }
+
+//    file << g_tp.wire_inside_mat.R_per_um << ", ";
+//    file << g_tp.wire_inside_mat.C_per_um / 1e-15 << ", ";
+//    file << g_tp.unit_len_wire_del / 1e-12 << ", ";
+//    file << g_tp.FO4 / 1e-12 << ", ";
+//    file << fin_res.data_array.delay_route_to_bank / 1e-9 << ", ";
+//    file << fin_res.data_array.delay_crossbar / 1e-9 << ", ";
+//    file << fin_res.data_array.dyn_read_energy_from_closed_page / 1e-9 << ", ";
+//    file << fin_res.data_array.dyn_read_energy_from_open_page / 1e-9 << ", ";
+//    file << fin_res.data_array.leak_power_subbank_closed_page / 1e-3 << ", ";
+//    file << fin_res.data_array.leak_power_subbank_open_page / 1e-3 << ", ";
+//    file << fin_res.data_array.leak_power_request_and_reply_networks / 1e-3 << ", ";
+//    file << fin_res.data_array.number_subbanks << ", " ;
+//    file << fin_res.data_array.page_size_in_bits << ", " ;
+//    file << fin_res.data_array.activate_energy * 1e9 << ", " ;
+//    file << fin_res.data_array.read_energy * 1e9 << ", " ;
+//    file << fin_res.data_array.write_energy * 1e9 << ", " ;
+//    file << fin_res.data_array.precharge_energy * 1e9 << ", " ;
+//    file << fin_res.data_array.trcd * 1e9 << ", " ;
+//    file << fin_res.data_array.cas_latency * 1e9 << ", " ;
+//    file << fin_res.data_array.precharge_delay * 1e9 << ", " ;
+//    file << fin_res.data_array.all_banks_height / fin_res.data_array.all_banks_width;
+    file<<endl;
+  }
+  file.close();
+}
+
+
+
+void output_UCA(uca_org_t *fr)
+{
+  //    if (NUCA)
+  if (0) {
+    cout << "\n\n Detailed Bank Stats:\n";
+    cout << "    Bank Size (bytes): %d\n" <<
+                                     (int) (g_ip->cache_sz);
+  }
+  else {
+    if (g_ip->data_arr_ram_cell_tech_type == 3) {
+      cout << "\n---------- CACTI version 6.5, Uniform Cache Access " <<
+        "Logic Process Based DRAM Model ----------\n";
+    }
+    else if (g_ip->data_arr_ram_cell_tech_type == 4) {
+      cout << "\n---------- CACTI version 6.5, Uniform" <<
+        "Cache Access Commodity DRAM Model ----------\n";
+    }
+    else {
+      cout << "\n---------- CACTI version 6.5, Uniform Cache Access "
+        "SRAM Model ----------\n";
+    }
+    cout << "\nCache Parameters:\n";
+    cout << "    Total cache size (bytes): " <<
+      (int) (g_ip->cache_sz) << endl;
+  }
+
+  cout << "    Number of banks: " << (int) g_ip->nbanks << endl;
+  if (g_ip->fully_assoc|| g_ip->pure_cam)
+    cout << "    Associativity: fully associative\n";
+  else {
+    if (g_ip->tag_assoc == 1)
+      cout << "    Associativity: direct mapped\n";
+    else
+      cout << "    Associativity: " <<
+        g_ip->tag_assoc << endl;
+  }
+
+
+  cout << "    Block size (bytes): " << g_ip->line_sz << endl;
+  cout << "    Read/write Ports: " <<
+    g_ip->num_rw_ports << endl;
+  cout << "    Read ports: " <<
+    g_ip->num_rd_ports << endl;
+  cout << "    Write ports: " <<
+    g_ip->num_wr_ports << endl;
+  if (g_ip->fully_assoc|| g_ip->pure_cam)
+          cout << "    search ports: " <<
+              g_ip->num_search_ports << endl;
+  cout << "    Technology size (nm): " <<
+    g_ip->F_sz_nm << endl << endl;
+
+  cout << "    Access time (ns): " << fr->access_time*1e9 << endl;
+  cout << "    Cycle time (ns):  " << fr->cycle_time*1e9 << endl;
+  if (g_ip->data_arr_ram_cell_tech_type >= 4) {
+    cout << "    Precharge Delay (ns): " << fr->data_array2->precharge_delay*1e9 << endl;
+    cout << "    Activate Energy (nJ): " << fr->data_array2->activate_energy*1e9 << endl;
+    cout << "    Read Energy (nJ): " << fr->data_array2->read_energy*1e9 << endl;
+    cout << "    Write Energy (nJ): " << fr->data_array2->write_energy*1e9 << endl;
+    cout << "    Precharge Energy (nJ): " << fr->data_array2->precharge_energy*1e9 << endl;
+    cout << "    Leakage Power Closed Page (mW): " << fr->data_array2->leak_power_subbank_closed_page*1e3 << endl;
+    cout << "    Leakage Power Open Page (mW): " << fr->data_array2->leak_power_subbank_open_page*1e3 << endl;
+    cout << "    Leakage Power I/O (mW): " << fr->data_array2->leak_power_request_and_reply_networks*1e3 << endl;
+    cout << "    Refresh power (mW): " <<
+      fr->data_array2->refresh_power*1e3 << endl;
+  }
+  else {
+          if ((g_ip->fully_assoc|| g_ip->pure_cam))
+          {
+                  cout << "    Total dynamic associative search energy per access (nJ): " <<
+                  fr->power.searchOp.dynamic*1e9 << endl;
+//               cout << "    Total dynamic read energy per access (nJ): " <<
+//               fr->power.readOp.dynamic*1e9 << endl;
+//               cout << "    Total dynamic write energy per access (nJ): " <<
+//               fr->power.writeOp.dynamic*1e9 << endl;
+          }
+//       else
+//       {
+                  cout << "    Total dynamic read energy per access (nJ): " <<
+                  fr->power.readOp.dynamic*1e9 << endl;
+                  cout << "    Total dynamic write energy per access (nJ): " <<
+                  fr->power.writeOp.dynamic*1e9 << endl;
+//       }
+          cout << "    Total leakage power of a bank"
+          " (mW): " << fr->power.readOp.leakage*1e3 << endl;
+          cout << "    Total gate leakage power of a bank"
+          " (mW): " << fr->power.readOp.gate_leakage*1e3 << endl;
+  }
+
+  if (g_ip->data_arr_ram_cell_tech_type ==3 || g_ip->data_arr_ram_cell_tech_type ==4)
+  {
+  }
+  cout <<  "    Cache height x width (mm): " <<
+    fr->cache_ht*1e-3 << " x " << fr->cache_len*1e-3 << endl << endl;
+
+
+  cout << "    Best Ndwl : " << fr->data_array2->Ndwl << endl;
+  cout << "    Best Ndbl : " << fr->data_array2->Ndbl << endl;
+  cout << "    Best Nspd : " << fr->data_array2->Nspd << endl;
+  cout << "    Best Ndcm : " << fr->data_array2->deg_bl_muxing << endl;
+  cout << "    Best Ndsam L1 : " << fr->data_array2->Ndsam_lev_1 << endl;
+  cout << "    Best Ndsam L2 : " << fr->data_array2->Ndsam_lev_2 << endl << endl;
+
+  if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+  {
+    cout << "    Best Ntwl : " << fr->tag_array2->Ndwl << endl;
+    cout << "    Best Ntbl : " << fr->tag_array2->Ndbl << endl;
+    cout << "    Best Ntspd : " << fr->tag_array2->Nspd << endl;
+    cout << "    Best Ntcm : " << fr->tag_array2->deg_bl_muxing << endl;
+    cout << "    Best Ntsam L1 : " << fr->tag_array2->Ndsam_lev_1 << endl;
+    cout << "    Best Ntsam L2 : " << fr->tag_array2->Ndsam_lev_2 << endl;
+  }
+
+  switch (fr->data_array2->wt) {
+    case (0):
+      cout <<  "    Data array, H-tree wire type: Delay optimized global wires\n";
+      break;
+    case (1):
+      cout <<  "    Data array, H-tree wire type: Global wires with 5\% delay penalty\n";
+      break;
+    case (2):
+      cout <<  "    Data array, H-tree wire type: Global wires with 10\% delay penalty\n";
+      break;
+    case (3):
+      cout <<  "    Data array, H-tree wire type: Global wires with 20\% delay penalty\n";
+      break;
+    case (4):
+      cout <<  "    Data array, H-tree wire type: Global wires with 30\% delay penalty\n";
+      break;
+    case (5):
+      cout <<  "    Data array, wire type: Low swing wires\n";
+      break;
+    default:
+      cout << "ERROR - Unknown wire type " << (int) fr->data_array2->wt <<endl;
+      exit(0);
+  }
+
+  if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) {
+    switch (fr->tag_array2->wt) {
+      case (0):
+        cout <<  "    Tag array, H-tree wire type: Delay optimized global wires\n";
+        break;
+      case (1):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 5\% delay penalty\n";
+        break;
+      case (2):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 10\% delay penalty\n";
+        break;
+      case (3):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 20\% delay penalty\n";
+        break;
+      case (4):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 30\% delay penalty\n";
+        break;
+      case (5):
+        cout <<  "    Tag array, wire type: Low swing wires\n";
+        break;
+      default:
+        cout << "ERROR - Unknown wire type " << (int) fr->tag_array2->wt <<endl;
+        exit(-1);
+    }
+  }
+
+  if (g_ip->print_detail)
+  {
+    //if(g_ip->fully_assoc) return;
+
+    /* Delay stats */
+    /* data array stats */
+    cout << endl << "Time Components:" << endl << endl;
+
+    cout << "  Data side (with Output driver) (ns): " <<
+      fr->data_array2->access_time/1e-9 << endl;
+
+    cout <<  "\tH-tree input delay (ns): " <<
+      fr->data_array2->delay_route_to_bank * 1e9 +
+      fr->data_array2->delay_input_htree * 1e9 << endl;
+
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+    {
+      cout <<  "\tDecoder + wordline delay (ns): " <<
+        fr->data_array2->delay_row_predecode_driver_and_block * 1e9 +
+        fr->data_array2->delay_row_decoder * 1e9 << endl;
+    }
+    else
+    {
+        cout <<  "\tCAM search delay (ns): " <<
+          fr->data_array2->delay_matchlines * 1e9 << endl;
+    }
+
+    cout <<  "\tBitline delay (ns): " <<
+      fr->data_array2->delay_bitlines/1e-9 << endl;
+
+    cout <<  "\tSense Amplifier delay (ns): " <<
+      fr->data_array2->delay_sense_amp * 1e9 << endl;
+
+
+    cout <<  "\tH-tree output delay (ns): " <<
+      fr->data_array2->delay_subarray_output_driver * 1e9 +
+      fr->data_array2->delay_dout_htree * 1e9 << endl;
+
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      /* tag array stats */
+      cout << endl << "  Tag side (with Output driver) (ns): " <<
+        fr->tag_array2->access_time/1e-9 << endl;
+
+      cout <<  "\tH-tree input delay (ns): " <<
+        fr->tag_array2->delay_route_to_bank * 1e9 +
+        fr->tag_array2->delay_input_htree * 1e9 << endl;
+
+      cout <<  "\tDecoder + wordline delay (ns): " <<
+        fr->tag_array2->delay_row_predecode_driver_and_block * 1e9 +
+        fr->tag_array2->delay_row_decoder * 1e9 << endl;
+
+      cout <<  "\tBitline delay (ns): " <<
+        fr->tag_array2->delay_bitlines/1e-9 << endl;
+
+      cout <<  "\tSense Amplifier delay (ns): " <<
+        fr->tag_array2->delay_sense_amp * 1e9 << endl;
+
+      cout <<  "\tComparator delay (ns): " <<
+        fr->tag_array2->delay_comparator * 1e9 << endl;
+
+      cout <<  "\tH-tree output delay (ns): " <<
+        fr->tag_array2->delay_subarray_output_driver * 1e9 +
+        fr->tag_array2->delay_dout_htree * 1e9 << endl;
+    }
+
+
+
+    /* Energy/Power stats */
+    cout << endl << endl << "Power Components:" << endl << endl;
+
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+    {
+        cout << "  Data array: Total dynamic read energy/access  (nJ): " <<
+              fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+        cout << "\tTotal leakage read/write power of a bank (mW): " <<
+                fr->data_array2->power.readOp.leakage * 1e3 << endl;
+
+        cout << "\tTotal energy in H-tree (that includes both "
+              "address and data transfer) (nJ): " <<
+                (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+                 fr->data_array2->power_data_output_htree.readOp.dynamic +
+                 fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+
+        cout << "\tTotal leakage power in H-tree (that includes both "
+              "address and data network) ((mW)): " <<
+                (fr->data_array2->power_addr_input_htree.readOp.leakage +
+                 fr->data_array2->power_data_output_htree.readOp.leakage +
+                 fr->data_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl;
+
+        cout << "\tTotal gate leakage power in H-tree (that includes both "
+              "address and data network) ((mW)): " <<
+                (fr->data_array2->power_addr_input_htree.readOp.gate_leakage +
+                 fr->data_array2->power_data_output_htree.readOp.gate_leakage +
+                 fr->data_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl;
+
+        cout << "\tOutput Htree inside bank Energy (nJ): " <<
+           fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tDecoder (nJ): " <<
+           fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+           fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tWordline (nJ): " <<
+           fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tBitline mux & associated drivers (nJ): " <<
+           fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+           fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+           fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+           fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+           fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+           fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+           fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+           fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+           fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+
+        cout <<  "\tBitlines precharge and equalization circuit (nJ): " <<
+                   fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tBitlines (nJ): " <<
+           fr->data_array2->power_bitlines.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tSense amplifier energy (nJ): " <<
+           fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+        cout <<  "\tSub-array output driver (nJ): " <<
+           fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+    }
+
+        else if (g_ip->pure_cam)
+        {
+
+                cout << "  CAM array:"<<endl;
+                cout << "  Total dynamic associative search energy/access  (nJ): " <<
+                      fr->data_array2->power.searchOp.dynamic * 1e9 << endl;
+                cout << "\tTotal energy in H-tree (that includes both "
+                              "match key and data transfer) (nJ): " <<
+                      (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+                       fr->data_array2->power_htree_out_search.searchOp.dynamic +
+                       fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl;
+                cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " <<
+                      (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+                               fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl;
+                cout <<  "\tSearchlines (nJ): " <<
+                           fr->data_array2->power_searchline.searchOp.dynamic * 1e9 +
+                           fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tMatchlines  (nJ): " <<
+                       fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 +
+                           fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tSub-array output driver (nJ): " <<
+                           fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl;
+
+
+                cout <<endl<< "  Total dynamic read energy/access  (nJ): " <<
+                      fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+                cout << "\tTotal energy in H-tree (that includes both "
+                              "address and data transfer) (nJ): " <<
+                      (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+                       fr->data_array2->power_data_output_htree.readOp.dynamic +
+                       fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+                cout << "\tOutput Htree inside bank Energy (nJ): " <<
+                           fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tDecoder (nJ): " <<
+                           fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tWordline (nJ): " <<
+                           fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tBitline mux & associated drivers (nJ): " <<
+                           fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+                           fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+                           fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tBitlines (nJ): " <<
+                           fr->data_array2->power_bitlines.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl;
+                cout <<  "\tSense amplifier energy (nJ): " <<
+                           fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tSub-array output driver (nJ): " <<
+                           fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+
+                cout << endl <<"  Total leakage power of a bank (mW): " <<
+                      fr->data_array2->power.readOp.leakage * 1e3 << endl;
+        }
+        else
+        {
+                cout << "  Fully associative array:"<<endl;
+                cout << "  Total dynamic associative search energy/access  (nJ): " <<
+                  fr->data_array2->power.searchOp.dynamic * 1e9 << endl;
+                cout << "\tTotal energy in H-tree (that includes both "
+                              "match key and data transfer) (nJ): " <<
+                      (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+                       fr->data_array2->power_htree_out_search.searchOp.dynamic +
+                       fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl;
+                cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " <<
+                      (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+                               fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl;
+                cout <<  "\tSearchlines (nJ): " <<
+                           fr->data_array2->power_searchline.searchOp.dynamic * 1e9 +
+                           fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tMatchlines  (nJ): " <<
+                       fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 +
+                           fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tData portion wordline (nJ): " <<
+                           fr->data_array2->power_matchline_to_wordline_drv.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tData Bitlines (nJ): " <<
+                           fr->data_array2->power_bitlines.searchOp.dynamic * 1e9 +
+                           fr->data_array2->power_prechg_eq_drivers.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tSense amplifier energy (nJ): " <<
+                           fr->data_array2->power_sense_amps.searchOp.dynamic * 1e9 << endl;
+                cout <<  "\tSub-array output driver (nJ): " <<
+                           fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl;
+
+
+                cout <<endl<< "  Total dynamic read energy/access  (nJ): " <<
+                      fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+                cout << "\tTotal energy in H-tree (that includes both "
+                              "address and data transfer) (nJ): " <<
+                      (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+                       fr->data_array2->power_data_output_htree.readOp.dynamic +
+                       fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+                cout << "\tOutput Htree inside bank Energy (nJ): " <<
+                           fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tDecoder (nJ): " <<
+                           fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tWordline (nJ): " <<
+                           fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tBitline mux & associated drivers (nJ): " <<
+                           fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+                           fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+                           fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tBitlines (nJ): " <<
+                           fr->data_array2->power_bitlines.readOp.dynamic * 1e9 +
+                           fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl;
+                cout <<  "\tSense amplifier energy (nJ): " <<
+                           fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+                cout <<  "\tSub-array output driver (nJ): " <<
+                           fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+
+                cout << endl <<"  Total leakage power of a bank (mW): " <<
+                  fr->data_array2->power.readOp.leakage * 1e3 << endl;
+      }
+
+
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      cout << endl << "  Tag array:  Total dynamic read energy/access (nJ): " <<
+        fr->tag_array2->power.readOp.dynamic * 1e9 << endl;
+      cout << "\tTotal leakage read/write power of a bank (mW): " <<
+          fr->tag_array2->power.readOp.leakage * 1e3 << endl;
+      cout << "\tTotal energy in H-tree (that includes both "
+        "address and data transfer) (nJ): " <<
+          (fr->tag_array2->power_addr_input_htree.readOp.dynamic +
+           fr->tag_array2->power_data_output_htree.readOp.dynamic +
+           fr->tag_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+
+      cout << "\tTotal leakage power in H-tree (that includes both "
+              "address and data network) ((mW)): " <<
+                (fr->tag_array2->power_addr_input_htree.readOp.leakage +
+                 fr->tag_array2->power_data_output_htree.readOp.leakage +
+                 fr->tag_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl;
+
+          cout << "\tTotal gate leakage power in H-tree (that includes both "
+              "address and data network) ((mW)): " <<
+                (fr->tag_array2->power_addr_input_htree.readOp.gate_leakage +
+                 fr->tag_array2->power_data_output_htree.readOp.gate_leakage +
+                 fr->tag_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl;
+
+      cout << "\tOutput Htree inside a bank Energy (nJ): " <<
+        fr->tag_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tDecoder (nJ): " <<
+        fr->tag_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tWordline (nJ): " <<
+        fr->tag_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitline mux & associated drivers (nJ): " <<
+        fr->tag_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+        fr->tag_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+        fr->tag_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitlines precharge and equalization circuit (nJ): " <<
+        fr->tag_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitlines (nJ): " <<
+        fr->tag_array2->power_bitlines.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSense amplifier energy (nJ): " <<
+        fr->tag_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSub-array output driver (nJ): " <<
+        fr->tag_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+    }
+
+    cout << endl << endl <<  "Area Components:" << endl << endl;
+    /* Data array area stats */
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+        cout <<  "  Data array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    else if (g_ip->pure_cam)
+        cout <<  "  CAM array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    else
+        cout <<  "  Fully associative cache array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    cout <<  "\tHeight (mm): " <<
+      fr->data_array2->all_banks_height*1e-3 << endl;
+    cout <<  "\tWidth (mm): " <<
+      fr->data_array2->all_banks_width*1e-3 << endl;
+    if (g_ip->print_detail) {
+      cout <<  "\tArea efficiency (Memory cell area/Total area) - " <<
+        fr->data_array2->area_efficiency << " %" << endl;
+      cout << "\t\tMAT Height (mm): " <<
+        fr->data_array2->mat_height*1e-3 << endl;
+      cout << "\t\tMAT Length (mm): " <<
+        fr->data_array2->mat_length*1e-3 << endl;
+      cout << "\t\tSubarray Height (mm): " <<
+        fr->data_array2->subarray_height*1e-3 << endl;
+      cout << "\t\tSubarray Length (mm): " <<
+        fr->data_array2->subarray_length*1e-3 << endl;
+    }
+
+    /* Tag array area stats */
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      cout << endl << "  Tag array: Area (mm2): " << fr->tag_array2->area * 1e-6 << endl;
+      cout <<  "\tHeight (mm): " <<
+        fr->tag_array2->all_banks_height*1e-3 << endl;
+      cout <<  "\tWidth (mm): " <<
+        fr->tag_array2->all_banks_width*1e-3 << endl;
+      if (g_ip->print_detail)
+      {
+        cout <<  "\tArea efficiency (Memory cell area/Total area) - " <<
+          fr->tag_array2->area_efficiency << " %" << endl;
+      cout << "\t\tMAT Height (mm): " <<
+        fr->tag_array2->mat_height*1e-3 << endl;
+      cout << "\t\tMAT Length (mm): " <<
+        fr->tag_array2->mat_length*1e-3 << endl;
+      cout << "\t\tSubarray Height (mm): " <<
+        fr->tag_array2->subarray_height*1e-3 << endl;
+      cout << "\t\tSubarray Length (mm): " <<
+        fr->tag_array2->subarray_length*1e-3 << endl;
+      }
+    }
+    Wire wpr;
+    wpr.print_wire();
+
+    //cout << "FO4 = " << g_tp.FO4 << endl;
+  }
+}
+
+// McPAT's plain interface, please keep !!!
+//
+// Runs a full CACTI solve on a parameter set supplied by the caller.
+//
+// local_interface is installed as the global g_ip (the pointer is stored as
+// is; nothing is copied and nothing is deleted here), its error_checking()
+// is run, and its F_sz_um is used to initialize the technology parameters.
+// solve() then fills in the result, which is returned by value.  The result
+// starts out with valid == false.
+//
+// NOTE: no field of g_ip is populated in this function.  The caller must
+// have filled in the InputParameter beforehand (technology types, cache
+// geometry, objective weights and deviations, port counts, wire type,
+// forced configuration, ECC, ...).
+uca_org_t cacti_interface(InputParameter  * const local_interface)
+{
+  uca_org_t result;
+  result.valid = false;
+
+  // Publish the caller's parameters as the global input set.
+  g_ip = local_interface;
+  g_ip->error_checking();
+
+  init_tech_params(g_ip->F_sz_um, false);
+
+  // Do not delete this object: constructing it initializes wires.
+  Wire wire_init;
+
+  solve(&result);
+
+  return result;
+}
+
+// McPAT's plain interface, please keep !!!
+//
+// Initialization-only entry point: performs the same global setup as
+// cacti_interface() but does not call solve().
+//
+// local_interface is installed as the global g_ip (the pointer is stored as
+// is; nothing is copied and nothing is deleted here), its error_checking()
+// is run, and its F_sz_um is used to initialize the technology parameters.
+// The returned uca_org_t only has valid set to false; no other member is
+// written by this function.
+//
+// NOTE: no field of g_ip is populated here.  The caller must have filled in
+// the InputParameter beforehand.
+uca_org_t init_interface(InputParameter* const local_interface)
+{
+  uca_org_t result;
+  result.valid = false;
+
+  // Publish the caller's parameters as the global input set.
+  g_ip = local_interface;
+  g_ip->error_checking();
+
+  init_tech_params(g_ip->F_sz_um, false);
+
+  // Do not delete this object: constructing it initializes wires.
+  Wire wire_init;
+
+  return result;
+}
+
+// Re-evaluates an existing result against a (possibly changed) parameter set.
+//
+// Performs the same global setup as the initialization path and then calls
+// update(fin_res) where that path would call solve().
+void reconfigure(InputParameter *local_interface, uca_org_t *fin_res)
+{
+  // Point the global interface (g_ip) at the caller's parameters -- the
+  // object is aliased, not copied -- and run its error checking.
+  g_ip = local_interface;
+  g_ip->error_checking();
+
+  // Initialize technology parameters.
+  init_tech_params(g_ip->F_sz_um, false);
+
+  // Do not delete this object: constructing it initializes wires.
+  Wire wire_init;
+
+  // update() takes the place that solve() has in the initialization process.
+  update(fin_res);
+}
diff --git a/ext/mcpat/cacti/io.h b/ext/mcpat/cacti/io.h
new file mode 100644 (file)
index 0000000..b1c2565
--- /dev/null
@@ -0,0 +1,44 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#ifndef __IO_H__
+#define __IO_H__
+
+
+#include "cacti_interface.h"
+#include "const.h"
+
+// Takes the result by const reference.  Judging by the name it writes the
+// result out in CSV form; the definition is not in this header -- TODO
+// confirm against the implementation.
+void output_data_csv(const uca_org_t & fin_res);
+// Takes the result through a non-const pointer.  Presumably prints a report
+// for fin_res -- TODO confirm against the implementation, including whether
+// the pointed-to result is modified.
+void output_UCA(uca_org_t * fin_res);
+
+
+#endif
diff --git a/ext/mcpat/cacti/main.cc b/ext/mcpat/cacti/main.cc
new file mode 100644 (file)
index 0000000..d6e12be
--- /dev/null
@@ -0,0 +1,191 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#include <iostream>
+
+#include "io.h"
+
+using namespace std;
+
+
+int main(int argc,char *argv[])
+{
+
+  uca_org_t result;
+  if (argc != 53 && argc != 55)
+  {
+    bool infile_specified = false;
+    string infile_name("");
+
+    for (int32_t i = 0; i < argc; i++)
+    {
+      if (argv[i] == string("-infile"))
+      {
+        infile_specified = true;
+        i++;
+        infile_name = argv[i];
+      }
+    }
+
+    if (infile_specified == false)
+    {
+      cerr << " Invalid arguments -- how to use CACTI:" << endl;
+      cerr << "  1) cacti -infile <input file name>" << endl;
+      cerr << "  2) cacti arg1 ... arg52 -- please refer to the README file" << endl;
+      cerr << " No. of arguments input - " << argc << endl;
+      exit(1);
+    }
+    else
+    {
+      result = cacti_interface(infile_name);
+    }
+  }
+  else if (argc == 53)
+  {
+          result = cacti_interface(atoi(argv[ 1]),
+                          atoi(argv[ 2]),
+                          atoi(argv[ 3]),
+                          atoi(argv[ 4]),
+                          atoi(argv[ 5]),
+                          atoi(argv[ 6]),
+                          atoi(argv[ 7]),
+                          atoi(argv[ 8]),
+                          atoi(argv[ 9]),
+                          atof(argv[10]),
+                          atoi(argv[11]),
+                          atoi(argv[12]),
+                          atoi(argv[13]),
+                          atoi(argv[14]),
+                          atoi(argv[15]),
+                          atoi(argv[16]),
+                          atoi(argv[17]),
+                          atoi(argv[18]),
+                          atoi(argv[19]),
+                          atoi(argv[20]),
+                          atoi(argv[21]),
+                          atoi(argv[22]),
+                          atoi(argv[23]),
+                          atoi(argv[24]),
+                          atoi(argv[25]),
+                          atoi(argv[26]),
+                          atoi(argv[27]),
+                          atoi(argv[28]),
+                          atoi(argv[29]),
+                          atoi(argv[30]),
+                          atoi(argv[31]),
+                          atoi(argv[32]),
+                          atoi(argv[33]),
+                          atoi(argv[34]),
+                          atoi(argv[35]),
+                          atoi(argv[36]),
+                          atoi(argv[37]),
+                          atoi(argv[38]),
+                          atoi(argv[39]),
+                          atoi(argv[40]),
+                          atoi(argv[41]),
+                          atoi(argv[42]),
+                          atoi(argv[43]),
+                          atoi(argv[44]),
+                          atoi(argv[45]),
+                          atoi(argv[46]),
+                          atoi(argv[47]),
+                          atoi(argv[48]),
+                          atoi(argv[49]),
+                          atoi(argv[50]),
+                          atoi(argv[51]),
+                          atoi(argv[52]));
+  }
+  else
+  {
+          result = cacti_interface(atoi(argv[ 1]),
+                          atoi(argv[ 2]),
+                          atoi(argv[ 3]),
+                          atoi(argv[ 4]),
+                          atoi(argv[ 5]),
+                          atoi(argv[ 6]),
+                          atoi(argv[ 7]),
+                          atoi(argv[ 8]),
+                          atof(argv[ 9]),
+                          atoi(argv[10]),
+                          atoi(argv[11]),
+                          atoi(argv[12]),
+                          atoi(argv[13]),
+                          atoi(argv[14]),
+                          atoi(argv[15]),
+                          atoi(argv[16]),
+                          atoi(argv[17]),
+                          atoi(argv[18]),
+                          atoi(argv[19]),
+                          atoi(argv[20]),
+                          atoi(argv[21]),
+                          atoi(argv[22]),
+                          atoi(argv[23]),
+                          atoi(argv[24]),
+                          atoi(argv[25]),
+                          atoi(argv[26]),
+                          atoi(argv[27]),
+                          atoi(argv[28]),
+                          atoi(argv[29]),
+                          atoi(argv[30]),
+                          atoi(argv[31]),
+                          atoi(argv[32]),
+                          atoi(argv[33]),
+                          atoi(argv[34]),
+                          atoi(argv[35]),
+                          atoi(argv[36]),
+                          atoi(argv[37]),
+                          atoi(argv[38]),
+                          atoi(argv[39]),
+                          atoi(argv[40]),
+                          atoi(argv[41]),
+                          atoi(argv[42]),
+                          atoi(argv[43]),
+                          atoi(argv[44]),
+                          atoi(argv[45]),
+                          atoi(argv[46]),
+                          atoi(argv[47]),
+                          atoi(argv[48]),
+                          atoi(argv[49]),
+                          atoi(argv[50]),
+                          atoi(argv[51]),
+                          atoi(argv[52]),
+                          atoi(argv[53]),
+                          atoi(argv[54]));
+  }
+
+  result.cleanup();
+//  delete result.data_array2;
+//  if (result.tag_array2!=NULL)
+//       delete result.tag_array2;
+
+  return 0;
+}
+
diff --git a/ext/mcpat/cacti/makefile b/ext/mcpat/cacti/makefile
new file mode 100644 (file)
index 0000000..2728691
--- /dev/null
@@ -0,0 +1,28 @@
+# Driver makefile: creates the per-configuration object directory and then
+# delegates the actual build to $(TAR).mk with TAG set to dbg or opt.
+TAR = cacti
+
+# 'all' names an action, not a file, so it is listed as phony too.
+# NOTE(review): 'depend' is declared phony but has no rule in this file.
+.PHONY: all dbg opt depend clean clean_dbg clean_opt
+
+# Default target: optimized build.
+all: opt
+
+dbg: $(TAR).mk obj_dbg
+       @$(MAKE) TAG=dbg -C . -f $(TAR).mk
+
+opt: $(TAR).mk obj_opt
+       @$(MAKE) TAG=opt -C . -f $(TAR).mk
+
+# Object directories; the rules only run when the directory is missing.
+obj_dbg:
+       mkdir $@
+
+obj_opt:
+       mkdir $@
+
+clean: clean_dbg clean_opt
+
+# Each clean target depends on its object directory, so the directory is
+# created if missing, the sub-make's clean is run, and then the directory
+# ($<) is removed.
+clean_dbg: obj_dbg
+       @$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
+       rm -rf $<
+
+clean_opt: obj_opt
+       @$(MAKE) TAG=opt -C . -f $(TAR).mk clean
+       rm -rf $<
+
diff --git a/ext/mcpat/cacti/mat.cc b/ext/mcpat/cacti/mat.cc
new file mode 100755 (executable)
index 0000000..ef98107
--- /dev/null
@@ -0,0 +1,1748 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+
+#include "mat.h"
+
+Mat::Mat(const DynamicParameter & dyn_p)
+ :dp(dyn_p),
+  power_subarray_out_drv(),
+  delay_fa_tag(0), delay_cam(0),
+  delay_before_decoder(0), delay_bitline(0),
+  delay_wl_reset(0), delay_bl_restore(0),
+  delay_searchline(0), delay_matchchline(0),
+  delay_cam_sl_restore(0), delay_cam_ml_reset(0),
+  delay_fa_ram_wl(0),delay_hit_miss_reset(0),
+  delay_hit_miss(0),
+  subarray(dp, dp.fully_assoc),
+  power_bitline(), per_bitline_read_energy(0),
+  deg_bl_muxing(dp.deg_bl_muxing),
+  num_act_mats_hor_dir(dyn_p.num_act_mats_hor_dir),
+  delay_writeback(0),
+  cell(subarray.cell), cam_cell(subarray.cam_cell),
+  is_dram(dyn_p.is_dram),
+  pure_cam(dyn_p.pure_cam),
+  num_mats(dp.num_mats),
+  power_sa(), delay_sa(0),
+  leak_power_sense_amps_closed_page_state(0),
+  leak_power_sense_amps_open_page_state(0),
+  delay_subarray_out_drv(0),
+  delay_comparator(0), power_comparator(),
+  num_do_b_mat(dyn_p.num_do_b_mat), num_so_b_mat(dyn_p.num_so_b_mat),
+  num_subarrays_per_mat(dp.num_subarrays/dp.num_mats),
+  num_subarrays_per_row(dp.Ndwl/dp.num_mats_h_dir)
+{
+  assert(num_subarrays_per_mat <= 4);
+  assert(num_subarrays_per_row <= 2);
+  is_fa = (dp.fully_assoc) ? true : false;
+  camFlag = (is_fa || pure_cam);//although cam_cell.w = cell.w for fa, we still differentiate them.
+
+  if (is_fa || pure_cam)
+          num_subarrays_per_row = num_subarrays_per_mat>2?num_subarrays_per_mat/2:num_subarrays_per_mat;
+
+  if (dp.use_inp_params == 1) {
+          RWP  = dp.num_rw_ports;
+          ERP  = dp.num_rd_ports;
+          EWP  = dp.num_wr_ports;
+          SCHP = dp.num_search_ports;
+  }
+  else {
+    RWP = g_ip->num_rw_ports;
+    ERP = g_ip->num_rd_ports;
+    EWP = g_ip->num_wr_ports;
+    SCHP = g_ip->num_search_ports;
+
+  }
+
+  double number_sa_subarray;
+
+  if (!is_fa && !pure_cam)
+  {
+          number_sa_subarray = subarray.num_cols / deg_bl_muxing;
+  }
+  else if (is_fa && !pure_cam)
+  {
+          number_sa_subarray =  (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram) / deg_bl_muxing;
+  }
+
+  else
+  {
+          number_sa_subarray =  (subarray.num_cols_fa_cam) / deg_bl_muxing;
+  }
+
+  int    num_dec_signals           = subarray.num_rows;
+  double C_ld_bit_mux_dec_out      = 0;
+  double C_ld_sa_mux_lev_1_dec_out = 0;
+  double C_ld_sa_mux_lev_2_dec_out = 0;
+  double R_wire_wl_drv_out;
+
+  if (!is_fa && !pure_cam)
+    {
+            R_wire_wl_drv_out = subarray.num_cols * cell.w * g_tp.wire_local.R_per_um;
+    }
+    else if (is_fa && !pure_cam)
+    {
+        R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w + subarray.num_cols_fa_ram * cell.w) * g_tp.wire_local.R_per_um ;
+    }
+    else
+    {
+        R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w ) * g_tp.wire_local.R_per_um;
+    }
+
+  double R_wire_bit_mux_dec_out = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w;//TODO:revisit for FA
+  double R_wire_sa_mux_dec_out  = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w;
+
+  if (deg_bl_muxing > 1)
+  {
+    C_ld_bit_mux_dec_out =
+      (2 * num_subarrays_per_mat * subarray.num_cols / deg_bl_muxing)*gate_C(g_tp.w_nmos_b_mux, 0, is_dram) +  // 2 transistor per cell
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    C_ld_sa_mux_lev_1_dec_out =
+      (num_subarrays_per_mat * number_sa_subarray / dp.Ndsam_lev_1)*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) +
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    C_ld_sa_mux_lev_2_dec_out =
+      (num_subarrays_per_mat * number_sa_subarray / (dp.Ndsam_lev_1*dp.Ndsam_lev_2))*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) +
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+
+  if (num_subarrays_per_row >= 2)
+  {
+    // wire heads for both right and left side of a mat, so half the resistance
+    R_wire_bit_mux_dec_out /= 2.0;
+    R_wire_sa_mux_dec_out  /= 2.0;
+  }
+
+
+  row_dec = new Decoder(
+      num_dec_signals,
+      false,
+      subarray.C_wl,
+      R_wire_wl_drv_out,
+      false/*is_fa*/,
+      is_dram,
+      true,
+      camFlag? cam_cell:cell);
+//  if (is_fa && (!dp.is_tag))
+//  {
+//    row_dec->exist = true;
+//  }
+  bit_mux_dec = new Decoder(
+      deg_bl_muxing,// This number is 1 for FA or CAM
+      false,
+      C_ld_bit_mux_dec_out,
+      R_wire_bit_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+  sa_mux_lev_1_dec = new Decoder(
+      dp.deg_senseamp_muxing_non_associativity, // This number is 1 for FA or CAM
+      dp.number_way_select_signals_mat ? true : false,//only sa_mux_lev_1_dec needs way select signal
+      C_ld_sa_mux_lev_1_dec_out,
+      R_wire_sa_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+  sa_mux_lev_2_dec = new Decoder(
+      dp.Ndsam_lev_2, // This number is 1 for FA or CAM
+      false,
+      C_ld_sa_mux_lev_2_dec_out,
+      R_wire_sa_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+
+  double C_wire_predec_blk_out;
+  double R_wire_predec_blk_out;
+
+  if (!is_fa && !pure_cam)
+      {
+
+          C_wire_predec_blk_out  = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cell.h;
+          R_wire_predec_blk_out  = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cell.h;
+
+      }
+      else //for pre-decode block's load is same for both FA and CAM
+      {
+          C_wire_predec_blk_out  = subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cam_cell.h;
+          R_wire_predec_blk_out  = subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cam_cell.h;
+      }
+
+
+  if (is_fa||pure_cam)
+          num_dec_signals += _log2(num_subarrays_per_mat);
+
+  PredecBlk * r_predec_blk1 = new PredecBlk(
+      num_dec_signals,
+      row_dec,
+      C_wire_predec_blk_out,
+      R_wire_predec_blk_out,
+      num_subarrays_per_mat,
+      is_dram,
+      true);
+  PredecBlk * r_predec_blk2 = new PredecBlk(
+      num_dec_signals,
+      row_dec,
+      C_wire_predec_blk_out,
+      R_wire_predec_blk_out,
+      num_subarrays_per_mat,
+      is_dram,
+      false);
+  PredecBlk * b_mux_predec_blk1 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * b_mux_predec_blk2 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, false);
+  PredecBlk * sa_mux_lev_1_predec_blk1 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * sa_mux_lev_1_predec_blk2 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, false);
+  PredecBlk * sa_mux_lev_2_predec_blk1 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * sa_mux_lev_2_predec_blk2 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, false);
+  dummy_way_sel_predec_blk1 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, true);
+  dummy_way_sel_predec_blk2 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, false);
+
+  PredecBlkDrv * r_predec_blk_drv1 = new PredecBlkDrv(0, r_predec_blk1, is_dram);
+  PredecBlkDrv * r_predec_blk_drv2 = new PredecBlkDrv(0, r_predec_blk2, is_dram);
+  PredecBlkDrv * b_mux_predec_blk_drv1 = new PredecBlkDrv(0, b_mux_predec_blk1, is_dram);
+  PredecBlkDrv * b_mux_predec_blk_drv2 = new PredecBlkDrv(0, b_mux_predec_blk2, is_dram);
+  PredecBlkDrv * sa_mux_lev_1_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk1, is_dram);
+  PredecBlkDrv * sa_mux_lev_1_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk2, is_dram);
+  PredecBlkDrv * sa_mux_lev_2_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk1, is_dram);
+  PredecBlkDrv * sa_mux_lev_2_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk2, is_dram);
+  way_sel_drv1 = new PredecBlkDrv(dyn_p.number_way_select_signals_mat, dummy_way_sel_predec_blk1, is_dram);
+  dummy_way_sel_predec_blk_drv2 = new PredecBlkDrv(1, dummy_way_sel_predec_blk2, is_dram);
+
+  r_predec            = new Predec(r_predec_blk_drv1, r_predec_blk_drv2);
+  b_mux_predec        = new Predec(b_mux_predec_blk_drv1, b_mux_predec_blk_drv2);
+  sa_mux_lev_1_predec = new Predec(sa_mux_lev_1_predec_blk_drv1, sa_mux_lev_1_predec_blk_drv2);
+  sa_mux_lev_2_predec = new Predec(sa_mux_lev_2_predec_blk_drv1, sa_mux_lev_2_predec_blk_drv2);
+
+  subarray_out_wire   = new Wire(g_ip->wt, subarray.area.h);//Bug should be subarray.area.w Owen and Sheng
+
+  double driver_c_gate_load;
+  double driver_c_wire_load;
+  double driver_r_wire_load;
+
+  if (is_fa || pure_cam)
+
+  {   //Although CAM and RAM use different bl pre-charge driver, assuming the precharge p size is the same
+          driver_c_gate_load =  (subarray.num_cols_fa_cam )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+          driver_c_wire_load =  subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um;
+          driver_r_wire_load =  subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um;
+          cam_bl_precharge_eq_drv = new Driver(
+                          driver_c_gate_load,
+                          driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+
+          if (!pure_cam)
+          {
+                  //This is only used for fully asso not pure CAM
+                  driver_c_gate_load =  (subarray.num_cols_fa_ram )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+                  driver_c_wire_load =  subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.C_per_um;
+                  driver_r_wire_load =  subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.R_per_um;
+                  bl_precharge_eq_drv = new Driver(
+                                  driver_c_gate_load,
+                                  driver_c_wire_load,
+                                  driver_r_wire_load,
+                                  is_dram);
+          }
+  }
+
+  else
+  {
+          driver_c_gate_load =  subarray.num_cols * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+          driver_c_wire_load =  subarray.num_cols * cell.w * g_tp.wire_outside_mat.C_per_um;
+          driver_r_wire_load =  subarray.num_cols * cell.w * g_tp.wire_outside_mat.R_per_um;
+          bl_precharge_eq_drv = new Driver(
+                          driver_c_gate_load,
+                          driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+  }
+  double area_row_decoder = row_dec->area.get_area() * subarray.num_rows * (RWP + ERP + EWP);
+  double w_row_decoder    = area_row_decoder / subarray.area.get_h();
+
+  double h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux =
+    compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
+
+  double h_subarray_out_drv = subarray_out_wire->area.get_area() *
+    (subarray.num_cols / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / subarray.area.get_w();
+
+
+  h_subarray_out_drv *= (RWP + ERP + SCHP);
+
+  double h_comparators                = 0.0;
+  double w_row_predecode_output_wires = 0.0;
+  double h_bit_mux_dec_out_wires      = 0.0;
+  double h_senseamp_mux_dec_out_wires = 0.0;
+
+  if ((!is_fa)&&(dp.is_tag))
+  {
+    //tagbits = (4 * num_cols_subarray / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / num_do_b_mat;
+    h_comparators  = compute_comparators_height(dp.tagbits, dyn_p.num_do_b_mat, subarray.area.get_w());
+    h_comparators *= (RWP + ERP);
+  }
+
+
+    int branch_effort_predec_blk1_out = (1 << r_predec_blk2->number_input_addr_bits);
+    int branch_effort_predec_blk2_out = (1 << r_predec_blk1->number_input_addr_bits);
+    w_row_predecode_output_wires   = (branch_effort_predec_blk1_out + branch_effort_predec_blk2_out) *
+      g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP);
+
+
+  double h_non_cell_area = (num_subarrays_per_mat / num_subarrays_per_row) *
+                           (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux +
+                            h_subarray_out_drv + h_comparators);
+
+  double w_non_cell_area = MAX(w_row_predecode_output_wires, num_subarrays_per_row * w_row_decoder);
+
+  if (deg_bl_muxing > 1)
+  {
+    h_bit_mux_dec_out_wires = deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    h_senseamp_mux_dec_out_wires =  dp.Ndsam_lev_1 * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    h_senseamp_mux_dec_out_wires += dp.Ndsam_lev_2 * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+
+  double h_addr_datain_wires;
+  if (!g_ip->ver_htree_wires_over_array)
+  {
+    h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat +
+                                  (dp.num_di_b_mat + dp.num_do_b_mat)/num_subarrays_per_row) *
+                                 g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP);
+
+    if (is_fa || pure_cam)
+    {
+        h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat +     //TODO: revisit
+                                      (dp.num_di_b_mat+ dp.num_do_b_mat )/num_subarrays_per_row) *
+                                       g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP) +
+                                       (dp.num_si_b_mat + dp.num_so_b_mat )/num_subarrays_per_row * g_tp.wire_inside_mat.pitch * SCHP;
+    }
+    //h_non_cell_area = 2 * h_bit_mux_sense_amp_precharge_sa_mux +
+    //MAX(h_addr_datain_wires, 2 * h_subarray_out_drv);
+    h_non_cell_area = (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux + h_comparators +
+                       h_subarray_out_drv) * (num_subarrays_per_mat / num_subarrays_per_row) +
+                      h_addr_datain_wires +
+                      h_bit_mux_dec_out_wires +
+                      h_senseamp_mux_dec_out_wires;
+
+  }
+
+  // double area_rectangle_center_mat = h_non_cell_area * w_non_cell_area;
+  double area_mat_center_circuitry = (r_predec_blk_drv1->area.get_area() +
+                                      b_mux_predec_blk_drv1->area.get_area() +
+                                      sa_mux_lev_1_predec_blk_drv1->area.get_area() +
+                                      sa_mux_lev_2_predec_blk_drv1->area.get_area() +
+                                      way_sel_drv1->area.get_area() +
+                                      r_predec_blk_drv2->area.get_area() +
+                                      b_mux_predec_blk_drv2->area.get_area() +
+                                      sa_mux_lev_1_predec_blk_drv2->area.get_area() +
+                                      sa_mux_lev_2_predec_blk_drv2->area.get_area() +
+                                      r_predec_blk1->area.get_area() +
+                                      b_mux_predec_blk1->area.get_area() +
+                                      sa_mux_lev_1_predec_blk1->area.get_area() +
+                                      sa_mux_lev_2_predec_blk1->area.get_area() +
+                                      r_predec_blk2->area.get_area() +
+                                      b_mux_predec_blk2->area.get_area() +
+                                      sa_mux_lev_1_predec_blk2->area.get_area() +
+                                      sa_mux_lev_2_predec_blk2->area.get_area() +
+                                      bit_mux_dec->area.get_area() +
+                                      sa_mux_lev_1_dec->area.get_area() +
+                                      sa_mux_lev_2_dec->area.get_area()) * (RWP + ERP + EWP);
+
+  double area_efficiency_mat;
+
+//  if (!is_fa)
+//  {
+    assert(num_subarrays_per_mat/num_subarrays_per_row>0);
+    area.h = (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h + h_non_cell_area;
+    area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area;
+    area.w = (area.h*area.w + area_mat_center_circuitry) / area.h;
+    area_efficiency_mat = subarray.area.get_area() * num_subarrays_per_mat * 100.0 / area.get_area();
+
+//    cout<<"h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux"<<h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux<<endl;
+//    cout<<"h_comparators"<<h_comparators<<endl;
+//    cout<<"h_subarray_out_drv"<<h_subarray_out_drv<<endl;
+//    cout<<"h_addr_datain_wires"<<h_addr_datain_wires<<endl;
+//    cout<<"h_bit_mux_dec_out_wires"<<h_bit_mux_dec_out_wires<<endl;
+//    cout<<"h_senseamp_mux_dec_out_wires"<<h_senseamp_mux_dec_out_wires<<endl;
+//    cout<<"h_non_cell_area"<<h_non_cell_area<<endl;
+//    cout<<"area.h =" << (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h<<endl;
+//    cout<<"w_non_cell_area"<<w_non_cell_area<<endl;
+//    cout<<"area_mat_center_circuitry"<<area_mat_center_circuitry<<endl;
+
+    assert(area.h>0);
+    assert(area.w>0);
+//  }
+//  else
+//  {
+//    area.h = (num_subarrays_per_mat / num_subarrays_per_row) * subarray.area.get_h() + h_non_cell_area;
+//    area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area;
+//    area.w = (area.h*area.w + area_mat_center_circuitry) / area.h;
+//    area_efficiency_mat = subarray.area.get_area() * num_subarrays_per_row * 100.0 / area.get_area();
+//  }
+  }
+
+
+/* Destructor: frees the decoder, predecoder, driver and wire objects that
+ * this Mat reaches through raw pointers. Objects dereferenced through the
+ * *_predec pointers are deleted before the *_predec objects themselves. */
+Mat::~Mat()
+{
+  delete row_dec;
+  delete bit_mux_dec;
+  delete sa_mux_lev_1_dec;
+  delete sa_mux_lev_2_dec;
+  // Predecoder blocks held by each *_predec object, then the two way-select dummy blocks.
+  delete r_predec->blk1;
+  delete r_predec->blk2;
+  delete b_mux_predec->blk1;
+  delete b_mux_predec->blk2;
+  delete sa_mux_lev_1_predec->blk1;
+  delete sa_mux_lev_1_predec->blk2;
+  delete sa_mux_lev_2_predec->blk1;
+  delete sa_mux_lev_2_predec->blk2;
+  delete dummy_way_sel_predec_blk1;
+  delete dummy_way_sel_predec_blk2;
+  // Predecoder block drivers, then the way-select driver and its dummy counterpart.
+  delete r_predec->drv1;
+  delete r_predec->drv2;
+  delete b_mux_predec->drv1;
+  delete b_mux_predec->drv2;
+  delete sa_mux_lev_1_predec->drv1;
+  delete sa_mux_lev_1_predec->drv2;
+  delete sa_mux_lev_2_predec->drv1;
+  delete sa_mux_lev_2_predec->drv2;
+  delete way_sel_drv1;
+  delete dummy_way_sel_predec_blk_drv2;
+  // The *_predec objects go only now, after the blk/drv members read through them above.
+  delete r_predec;
+  delete b_mux_predec;
+  delete sa_mux_lev_1_predec;
+  delete sa_mux_lev_2_predec;
+  /* subarray_out_wire is always freed; bl_precharge_eq_drv only when this is
+   * not a pure CAM.
+   * NOTE(review): presumably the constructor never allocates
+   * bl_precharge_eq_drv for a pure CAM -- confirm against the full constructor. */
+  delete subarray_out_wire;
+  if (!pure_cam)
+    delete bl_precharge_eq_drv;
+  /* CAM-only drivers. sl_precharge_eq_drv, sl_data_drv, ml_precharge_drv and
+   * ml_to_ram_wl_drv are allocated with new inside compute_cam_delay().
+   * NOTE(review): this assumes compute_cam_delay() has run exactly once (or
+   * that the pointers are otherwise initialised); repeated calls would appear
+   * to leak the earlier drivers -- confirm. */
+  if (is_fa || pure_cam)
+  {
+    delete sl_precharge_eq_drv ;
+    delete sl_data_drv ;
+    delete cam_bl_precharge_eq_drv;
+    delete ml_precharge_drv;
+    delete ml_to_ram_wl_drv;
+  }
+}
+
+
+
+double Mat::compute_delays(double inrisetime)
+{
+        int k;
+        double rd, C_intrinsic, C_ld, tf, R_bl_precharge,r_b_metal, R_bl, C_bl;
+        double outrisetime_search, outrisetime, row_dec_outrisetime;
+        // delay calculation for tags of fully associative cache
+        if (is_fa || pure_cam)
+        {
+                //Compute search access time
+                outrisetime_search = compute_cam_delay(inrisetime);
+                if (is_fa)
+                {
+                        bl_precharge_eq_drv->compute_delay(0);
+                        k = ml_to_ram_wl_drv->number_gates - 1;
+                        rd = tr_R_on(ml_to_ram_wl_drv->width_n[k], NCH, 1, is_dram, false, true);
+                        C_intrinsic = drain_C_(ml_to_ram_wl_drv->width_n[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) +
+                        drain_C_(ml_to_ram_wl_drv->width_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true);
+                        C_ld = ml_to_ram_wl_drv->c_gate_load+ ml_to_ram_wl_drv->c_wire_load;
+                        tf = rd * (C_intrinsic + C_ld) + ml_to_ram_wl_drv->r_wire_load * C_ld / 2;
+                        delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE);
+
+                        R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);
+                        r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um;//dummy rows in sram are filled in
+                        R_bl = subarray.num_rows * r_b_metal;
+                        C_bl = subarray.C_bl;
+                        delay_bl_restore = bl_precharge_eq_drv->delay +
+                                 log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))*
+                                 (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+
+
+                        outrisetime_search = compute_bitline_delay(outrisetime_search);
+                        outrisetime_search = compute_sa_delay(outrisetime_search);
+                }
+                        outrisetime_search = compute_subarray_out_drv(outrisetime_search);
+                        subarray_out_wire->set_in_rise_time(outrisetime_search);
+                        outrisetime_search = subarray_out_wire->signal_rise_time();
+                        delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay;
+
+
+                        //TODO: this is just for compute plain read/write energy for fa and cam, plain read/write access timing need to be revisited.
+                        outrisetime = r_predec->compute_delays(inrisetime);
+                        row_dec_outrisetime = row_dec->compute_delays(outrisetime);
+
+                        outrisetime = b_mux_predec->compute_delays(inrisetime);
+                        bit_mux_dec->compute_delays(outrisetime);
+
+                        outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime);
+                        sa_mux_lev_1_dec->compute_delays(outrisetime);
+
+                        outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime);
+                        sa_mux_lev_2_dec->compute_delays(outrisetime);
+
+                        if (pure_cam)
+                        {
+                          outrisetime = compute_bitline_delay(row_dec_outrisetime);
+                          outrisetime = compute_sa_delay(outrisetime);
+                        }
+                        return outrisetime_search;
+    }
+        else
+        {
+                bl_precharge_eq_drv->compute_delay(0);
+                if (row_dec->exist == true)
+                {
+                        int k = row_dec->num_gates - 1;
+                        double rd = tr_R_on(row_dec->w_dec_n[k], NCH, 1, is_dram, false, true);
+                        // TODO: this 4*cell.h number must be revisited
+                        double C_intrinsic = drain_C_(row_dec->w_dec_p[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) +
+                        drain_C_(row_dec->w_dec_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true);
+                        double C_ld = row_dec->C_ld_dec_out;
+                        double tf = rd * (C_intrinsic + C_ld) + row_dec->R_wire_dec_out * C_ld / 2;
+                        delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE);
+                }
+                double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);
+                double r_b_metal = cell.h * g_tp.wire_local.R_per_um;
+                double R_bl = subarray.num_rows * r_b_metal;
+                double C_bl = subarray.C_bl;
+
+                if (is_dram)
+                {
+                        delay_bl_restore = bl_precharge_eq_drv->delay + 2.3 * (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+                }
+                else
+                {
+                        delay_bl_restore = bl_precharge_eq_drv->delay +
+                        log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))*
+                        (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+                }
+  }
+
+
+
+  outrisetime = r_predec->compute_delays(inrisetime);
+  row_dec_outrisetime = row_dec->compute_delays(outrisetime);
+
+  outrisetime = b_mux_predec->compute_delays(inrisetime);
+  bit_mux_dec->compute_delays(outrisetime);
+
+  outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime);
+  sa_mux_lev_1_dec->compute_delays(outrisetime);
+
+  outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime);
+  sa_mux_lev_2_dec->compute_delays(outrisetime);
+
+  outrisetime = compute_bitline_delay(row_dec_outrisetime);
+  outrisetime = compute_sa_delay(outrisetime);
+  outrisetime = compute_subarray_out_drv(outrisetime);
+  subarray_out_wire->set_in_rise_time(outrisetime);
+  outrisetime = subarray_out_wire->signal_rise_time();
+
+  delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay;
+
+  if (dp.is_tag == true && dp.fully_assoc == false)
+  {
+    compute_comparator_delay(0);
+  }
+
+  if (row_dec->exist == false)
+    {
+      delay_wl_reset = MAX(r_predec->blk1->delay, r_predec->blk2->delay);
+    }
+  return outrisetime;
+}
+
+
+
+double Mat::compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h()
+{
+
+  double height = compute_tr_width_after_folding(g_tp.w_pmos_bl_precharge, camFlag? cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP))) +
+    compute_tr_width_after_folding(g_tp.w_pmos_bl_eq, camFlag? cam_cell.w:cell.w / (RWP + ERP + SCHP));  // precharge circuitry
+
+  if (deg_bl_muxing > 1)
+  {
+    height += compute_tr_width_after_folding(g_tp.w_nmos_b_mux, cell.w / (2 *(RWP + ERP)));  // col mux tr height
+    // height += deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP);  // bit mux dec out wires height
+  }
+
+  height += height_sense_amplifier(/*camFlag? sram_cell.w:*/cell.w * deg_bl_muxing / (RWP + ERP));  // sense_amp_height
+
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    height += compute_tr_width_after_folding(
+        g_tp.w_nmos_sa_mux, cell.w * dp.Ndsam_lev_1 / (RWP + ERP));  // sense_amp_mux_height
+    //height_senseamp_mux_decode_output_wires =  Ndsam * wire_inside_mat_pitch * (RWP + ERP);
+  }
+
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    height += compute_tr_width_after_folding(
+        g_tp.w_nmos_sa_mux, cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP));  // sense_amp_mux_height
+    //height_senseamp_mux_decode_output_wires =  Ndsam * wire_inside_mat_pitch * (RWP + ERP);
+
+    // add height of inverter-buffers between the two levels (pass-transistors) of sense-amp mux
+    height += 2 * compute_tr_width_after_folding(
+        pmos_to_nmos_sz_ratio(is_dram) * g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP));
+    height += 2 * compute_tr_width_after_folding(g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP));
+  }
+
+  // TODO: this should be uncommented...
+  /*if (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2 > 1)
+    {
+  //height_write_mux_decode_output_wires = deg_bl_muxing * Ndsam * g_tp.wire_inside_mat.pitch * (RWP + EWP);
+  double width_write_driver_write_mux  = width_write_driver_or_write_mux();
+  double height_write_driver_write_mux = compute_tr_width_after_folding(2 * width_write_driver_write_mux,
+  cell.w *
+  // deg_bl_muxing *
+  dp.Ndsam_lev_1 * dp.Ndsam_lev_2 / (RWP + EWP));
+  height += height_write_driver_write_mux;
+  }*/
+
+  return height;
+}
+
+
+
+double Mat::compute_cam_delay(double inrisetime)
+{
+
+  double out_time_ramp, this_delay;
+  double Rwire, tf, c_intrinsic, rd, Cwire, c_gate_load;
+
+
+  double Wdecdrivep, Wdecdriven, Wfadriven, Wfadrivep, Wfadrive2n, Wfadrive2p, Wfadecdrive1n, Wfadecdrive1p,
+    Wfadecdrive2n, Wfadecdrive2p, Wfadecdriven, Wfadecdrivep, Wfaprechn, Wfaprechp,
+    Wdummyn, Wdummyinvn, Wdummyinvp, Wfainvn, Wfainvp, Waddrnandn, Waddrnandp,
+    Wfanandn, Wfanandp, Wfanorn, Wfanorp, Wdecnandn, Wdecnandp, W_hit_miss_n, W_hit_miss_p;
+
+  double c_matchline_metal, r_matchline_metal, c_searchline_metal, r_searchline_metal,  dynSearchEng;
+  int Htagbits;
+
+  double driver_c_gate_load;
+  double driver_c_wire_load;
+  double driver_r_wire_load;
+  //double searchline_precharge_time;
+
+  double leak_power_cc_inverters_sram_cell         = 0;
+  double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0;
+  double leak_power_RD_port_sram_cell              = 0;
+  double leak_power_SCHP_port_sram_cell            = 0;
+  double leak_comparator_cam_cell                  =0;
+
+  double gate_leak_comparator_cam_cell          = 0;
+  double gate_leak_power_cc_inverters_sram_cell = 0;
+  double gate_leak_power_RD_port_sram_cell      = 0;
+  double gate_leak_power_SCHP_port_sram_cell    = 0;
+
+  c_matchline_metal   = cam_cell.get_w() * g_tp.wire_local.C_per_um;
+  c_searchline_metal  = cam_cell.get_h() * g_tp.wire_local.C_per_um;
+  r_matchline_metal   = cam_cell.get_w() * g_tp.wire_local.R_per_um;
+  r_searchline_metal  = cam_cell.get_h() * g_tp.wire_local.R_per_um;
+
+  dynSearchEng = 0.0;
+  delay_matchchline = 0.0;
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio(is_dram);
+  bool linear_scaling = false;
+
+  if (linear_scaling)
+  {
+          Wdecdrivep    =  450 * g_ip->F_sz_um;//this was 360 micron for the 0.8 micron process
+          Wdecdriven    =  300 * g_ip->F_sz_um;//this was 240 micron for the 0.8 micron process
+          Wfadriven     = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfadrivep     =  125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process
+          Wfadrive2n    =  250 * g_ip->F_sz_um;//this was 200 micron for the 0.8 micron process
+          Wfadrive2p    =  500 * g_ip->F_sz_um;//this was 400 micron for the 0.8 micron process
+          Wfadecdrive1n = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+          Wfadecdrive1p = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wfadecdrive2n =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfadecdrive2p =   50 * g_ip->F_sz_um;//this was  40 micron for the 0.8 micron process
+          Wfadecdriven  = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfadecdrivep  =  125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process
+          Wfaprechn     =  7.5 * g_ip->F_sz_um;//this was   6 micron for the 0.8 micron process
+          Wfainvn       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wfainvp       =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfanandn      =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfanandp      = 37.5 * g_ip->F_sz_um;//this was  30 micron for the 0.8 micron process
+          Wdecnandn     = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wdecnandp     = 37.5 * g_ip->F_sz_um;//this was  30 micron for the 0.8 micron process
+
+          Wfaprechp     = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wdummyn       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wdummyinvn    =   75 * g_ip->F_sz_um;//this was  60 micron for the 0.8 micron process
+          Wdummyinvp    =  100 * g_ip->F_sz_um;//this was  80 micron for the 0.8 micron process
+          Waddrnandn    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Waddrnandp    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfanorn       = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+          Wfanorp       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          W_hit_miss_n    = Wdummyn;
+          W_hit_miss_p    = g_tp.min_w_nmos_*p_to_n_sizing_r;
+          //TODO: this number should updated using new layout; from the NAND to output NOR should be computed using logical effort
+  }
+  else
+  {
+          Wdecdrivep    =  450 * g_ip->F_sz_um;//this was 360 micron for the 0.8 micron process
+          Wdecdriven    =  300 * g_ip->F_sz_um;//this was 240 micron for the 0.8 micron process
+          Wfadriven     = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfadrivep     =  125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process
+          Wfadrive2n    =  250 * g_ip->F_sz_um;//this was 200 micron for the 0.8 micron process
+          Wfadrive2p    =  500 * g_ip->F_sz_um;//this was 400 micron for the 0.8 micron process
+          Wfadecdrive1n = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+          Wfadecdrive1p = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wfadecdrive2n =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfadecdrive2p =   50 * g_ip->F_sz_um;//this was  40 micron for the 0.8 micron process
+          Wfadecdriven  = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfadecdrivep  =  125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process
+          Wfaprechn     =  7.5 * g_ip->F_sz_um;//this was   6 micron for the 0.8 micron process
+          Wfainvn       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wfainvp       =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfanandn      =   25 * g_ip->F_sz_um;//this was  20 micron for the 0.8 micron process
+          Wfanandp      = 37.5 * g_ip->F_sz_um;//this was  30 micron for the 0.8 micron process
+          Wdecnandn     = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          Wdecnandp     = 37.5 * g_ip->F_sz_um;//this was  30 micron for the 0.8 micron process
+
+          Wfaprechp     = g_tp.w_pmos_bl_precharge;//this was  10 micron for the 0.8 micron process
+          Wdummyn       = g_tp.cam.cell_nmos_w;
+          Wdummyinvn    =   75 * g_ip->F_sz_um;//this was  60 micron for the 0.8 micron process
+          Wdummyinvp    =  100 * g_ip->F_sz_um;//this was  80 micron for the 0.8 micron process
+          Waddrnandn    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Waddrnandp    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+          Wfanorn       = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+          Wfanorp       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+          W_hit_miss_n    = Wdummyn;
+          W_hit_miss_p    = g_tp.min_w_nmos_*p_to_n_sizing_r;
+  }
+
+  Htagbits = (int)(ceil ((double) (subarray.num_cols_fa_cam) / 2.0));
+
+  /* First stage, searchline is precharged. searchline data driver drives the searchline to open (if miss) the comparators.
+     search_line_delay, search_line_power, search_line_restore_delay for cycle time computation.
+     From the driver(am and an) to the comparators in all the rows including the dummy row,
+     Assuming that comparators in both the normal matching line and the dummy matching line have the same sizing */
+
+  //Searchline precharge circuitry is same as that of bitline. However, no sharing between search ports and r/w ports
+  //Searchline precharge routes horizontally
+  driver_c_gate_load = subarray.num_cols_fa_cam * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+  driver_c_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um;
+  driver_r_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um;
+
+  sl_precharge_eq_drv = new Driver(
+      driver_c_gate_load,
+          driver_c_wire_load,
+      driver_r_wire_load,
+      is_dram);
+
+  //searchline data driver ; subarray.num_rows + 1 is because of the dummy row
+  //data drv should only have gate_C not 2*gate_C since the two searchlines are differential--same as bitlines
+  driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wdummyn, 0, is_dram, false, false);
+  driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal;
+  driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal;
+  sl_data_drv = new Driver(
+      driver_c_gate_load,
+          driver_c_wire_load,
+      driver_r_wire_load,
+      is_dram);
+
+  sl_precharge_eq_drv->compute_delay(0);
+  double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);//Assuming CAM and SRAM have same Pre_eq_dr
+  double r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um;
+  double R_bl = (subarray.num_rows + 1) * r_b_metal;
+  double C_bl = subarray.C_bl_cam;
+  delay_cam_sl_restore = sl_precharge_eq_drv->delay
+                         + log(g_tp.cam.Vbitpre)* (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+
+  out_time_ramp = sl_data_drv->compute_delay(inrisetime);//After entering one mat, start to consider the inrisetime from 0(0 is passed from outside)
+
+  //matchline ops delay
+  delay_matchchline += sl_data_drv->delay;
+
+  /* second stage, from the trasistors in the comparators(both normal row and dummy row) to the NAND gates that combins both half*/
+  //matchline delay, matchline power, matchline_reset for cycle time computation,
+
+  ////matchline precharge circuitry routes vertically
+  //There are two matchline precharge driver chains per subarray.
+  driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wfaprechp, 0, is_dram);
+  driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal;
+  driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal;
+
+  ml_precharge_drv = new Driver(
+                                                  driver_c_gate_load,
+                              driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+
+  ml_precharge_drv->compute_delay(0);
+
+
+  rd =  tr_R_on(Wdummyn, NCH, 2, is_dram);
+  c_intrinsic = Htagbits*(2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram)//TODO: the cell_h_def should be revisit
+                                  + drain_C_(Wfaprechp, PCH, 1, 1, g_tp.cell_h_def, is_dram)/Htagbits);//since each halve only has one precharge tx per matchline
+
+  Cwire = c_matchline_metal * Htagbits;
+  Rwire = r_matchline_metal * Htagbits;
+  c_gate_load = gate_C(Waddrnandn + Waddrnandp, 0, is_dram);
+
+  double R_ml_precharge = tr_R_on(Wfaprechp, PCH, 1, is_dram);
+  //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+  double R_ml = Rwire;
+  double C_ml = Cwire + c_intrinsic;
+  delay_cam_ml_reset = ml_precharge_drv->delay
+                           + log(g_tp.cam.Vbitpre)* (R_ml_precharge * C_ml + R_ml * C_ml / 2);//TODO: latest CAM has sense amps on matchlines too
+
+  //matchline ops delay
+  tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+  this_delay = horowitz(out_time_ramp, tf, VTHFA2, VTHFA3, FALL);
+  delay_matchchline += this_delay;
+  out_time_ramp = this_delay / VTHFA3;
+
+  dynSearchEng += ((c_intrinsic + Cwire + c_gate_load)*(subarray.num_rows +1)) //+ 2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram))//TODO: need to be precise
+                                          * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *2;//* Ntbl;//each subarry has two halves
+
+  /* third stage, from the NAND2 gates to the drivers in the dummy row */
+  rd = tr_R_on(Waddrnandn, NCH, 2, is_dram);
+  c_intrinsic = drain_C_(Waddrnandn, NCH, 2, 1, g_tp.cell_h_def, is_dram) +
+                drain_C_(Waddrnandp, PCH, 1, 1, g_tp.cell_h_def, is_dram)*2;
+  c_gate_load = gate_C(Wdummyinvn + Wdummyinvp, 0, is_dram);
+  tf = rd * (c_intrinsic + c_gate_load);
+  this_delay = horowitz(out_time_ramp, tf, VTHFA3, VTHFA4, RISE);
+  out_time_ramp = this_delay / (1 - VTHFA4);
+  delay_matchchline += this_delay;
+
+  //only the dummy row has the extra inverter between NAND and NOR gates
+  dynSearchEng += (c_intrinsic* (subarray.num_rows+1)+ c_gate_load*2) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//  * Ntbl;
+
+  /* fourth stage, from the driver in dummy matchline to the NOR2 gate which drives the wordline of the data portion */
+  rd = tr_R_on(Wdummyinvn, NCH, 1, is_dram);
+  c_intrinsic = drain_C_(Wdummyinvn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wdummyinvp, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  Cwire = c_matchline_metal * Htagbits +  c_searchline_metal * (subarray.num_rows+1)/2;
+  Rwire = r_matchline_metal * Htagbits +  r_searchline_metal * (subarray.num_rows+1)/2;
+  c_gate_load = gate_C(Wfanorn + Wfanorp, 0, is_dram);
+  tf = rd * (c_intrinsic + Cwire + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+  this_delay = horowitz (out_time_ramp, tf, VTHFA4, VTHFA5, FALL);
+  out_time_ramp = this_delay / VTHFA5;
+  delay_matchchline += this_delay;
+
+  dynSearchEng += (c_intrinsic + Cwire + subarray.num_rows*c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl;
+
+  /*final statge from the NOR gate to drive the wordline of the data portion */
+
+  //searchline data driver There are two matchline precharge driver chains per subarray.
+  driver_c_gate_load = gate_C(W_hit_miss_n, 0, is_dram, false, false);//nmos of the pull down logic
+  driver_c_wire_load = subarray.C_wl_ram;
+  driver_r_wire_load = subarray.R_wl_ram;
+
+  ml_to_ram_wl_drv = new Driver(
+                                                  driver_c_gate_load,
+                              driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+
+
+
+  rd = tr_R_on(Wfanorn, NCH, 1, is_dram);
+  c_intrinsic = 2* drain_C_(Wfanorn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wfanorp, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  c_gate_load = gate_C(ml_to_ram_wl_drv->width_n[0] + ml_to_ram_wl_drv->width_p[0], 0, is_dram);
+  tf = rd * (c_intrinsic + c_gate_load);
+  this_delay = horowitz (out_time_ramp, tf, 0.5, 0.5, RISE);
+  out_time_ramp = this_delay / (1-0.5);
+  delay_matchchline += this_delay;
+
+  out_time_ramp   = ml_to_ram_wl_drv->compute_delay(out_time_ramp);
+
+  //c_gate_load energy is computed in ml_to_ram_wl_drv
+  dynSearchEng  += (c_intrinsic) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl;
+
+
+  /* peripheral-- hitting logic "CMOS VLSI Design Fig11.51*/
+  /*Precharge the hitting logic */
+  c_intrinsic = 2*drain_C_(W_hit_miss_p, NCH, 2, 1, g_tp.cell_h_def, is_dram);
+  Cwire = c_searchline_metal * subarray.num_rows;
+  Rwire = r_searchline_metal * subarray.num_rows;
+  c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, g_tp.cell_h_def, is_dram)* subarray.num_rows;
+
+  rd = tr_R_on(W_hit_miss_p, PCH, 1, is_dram, false, false);
+  //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+  double R_hit_miss = Rwire;
+  double C_hit_miss = Cwire + c_intrinsic;
+  delay_hit_miss_reset = log(g_tp.cam.Vbitpre)* (rd * C_hit_miss + R_hit_miss * C_hit_miss / 2);
+  dynSearchEng  += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+
+  /*hitting logic evaluation */
+  c_intrinsic = 2*drain_C_(W_hit_miss_n, NCH, 2, 1, g_tp.cell_h_def, is_dram);
+  Cwire = c_searchline_metal * subarray.num_rows;
+  Rwire = r_searchline_metal * subarray.num_rows;
+  c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, g_tp.cell_h_def, is_dram)* subarray.num_rows;
+
+  rd = tr_R_on(W_hit_miss_n, PCH, 1, is_dram, false, false);
+  tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+
+  delay_hit_miss = horowitz(0, tf, 0.5, 0.5, FALL);
+
+  if (is_fa)
+      delay_matchchline += MAX(ml_to_ram_wl_drv->delay, delay_hit_miss);
+
+  dynSearchEng  += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+
+  /* TODO: peripheral-- Priority Encoder, usually this is not necessary in processor components*/
+
+  power_matchline.searchOp.dynamic = dynSearchEng;
+
+  //leakage in one subarray
+  double Iport     = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0,  1, nmos, false, true);//TODO: how much is the idle time? just by *2?
+  double Iport_erp = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0,  2, nmos, false, true);
+  double Icell     = cmos_Isub_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2;
+  double Icell_comparator = cmos_Isub_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;//approx XOR with Inv
+
+  leak_power_cc_inverters_sram_cell         = Icell * g_tp.cam_cell.Vdd;
+  leak_comparator_cam_cell                  = Icell_comparator * g_tp.cam_cell.Vdd;
+  leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.cam_cell.Vdd;
+  leak_power_RD_port_sram_cell              = Iport_erp * g_tp.cam_cell.Vdd;
+  leak_power_SCHP_port_sram_cell            = 0;//search port and r/w port are sperate, therefore no access txs in search ports
+
+  power_matchline.searchOp.leakage += leak_power_cc_inverters_sram_cell +
+    leak_comparator_cam_cell +
+    leak_power_acc_tr_RW_or_WR_port_sram_cell +
+    leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) +
+    leak_power_RD_port_sram_cell * ERP +
+    leak_power_SCHP_port_sram_cell*SCHP;
+//  power_matchline.searchOp.leakage += leak_comparator_cam_cell;
+  power_matchline.searchOp.leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dumy line precise
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(0, Wfaprechp, 1, pmos) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Wfanorn, Wfanorp,2, nor) * g_tp.cam_cell.Vdd;
+  //In idle states, the hit/miss txs are closed (on) therefore no Isub
+  power_matchline.searchOp.leakage += 0;// subarray.num_rows * cmos_Isub_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+
+    // + cmos_Isub_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd;
+
+  //in idle state, Ig_on only possibly exist in access transistors of read only ports
+  double Ig_port_erp = cmos_Ig_leakage(g_tp.cam.cell_a_w, 0, 1, nmos, false, true);
+  double Ig_cell     = cmos_Ig_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2;
+  double Ig_cell_comparator = cmos_Ig_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;// cmos_Ig_leakage(Wdummyn, 0, 2, nmos)*2;
+
+  gate_leak_comparator_cam_cell          = Ig_cell_comparator* g_tp.cam_cell.Vdd;
+  gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.cam_cell.Vdd;
+  gate_leak_power_RD_port_sram_cell      = Ig_port_erp*g_tp.sram_cell.Vdd;
+  gate_leak_power_SCHP_port_sram_cell    = 0;
+
+  //cout<<"power_matchline.searchOp.leakage"<<power_matchline.searchOp.leakage<<endl;
+
+  power_matchline.searchOp.gate_leakage += gate_leak_power_cc_inverters_sram_cell;
+  power_matchline.searchOp.gate_leakage += gate_leak_comparator_cam_cell;
+  power_matchline.searchOp.gate_leakage += gate_leak_power_SCHP_port_sram_cell*SCHP + gate_leak_power_RD_port_sram_cell * ERP;
+  power_matchline.searchOp.gate_leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dumy line precise
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(0, Wfaprechp,1, pmos) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Wfanorn, Wfanorp, 2, nor) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += subarray.num_rows * cmos_Ig_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+
+                                       + cmos_Ig_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd;
+
+
+   return out_time_ramp;
+}
+
+
+double Mat::width_write_driver_or_write_mux()
+{
+  // calculate resistance of SRAM cell pull-up PMOS transistor
+  // cam and sram have same cell trasistor properties
+  double R_sram_cell_pull_up_tr  = tr_R_on(g_tp.sram.cell_pmos_w, NCH, 1, is_dram, true);
+  double R_access_tr             = tr_R_on(g_tp.sram.cell_a_w,    NCH, 1, is_dram, true);
+  double target_R_write_driver_and_mux = (2 * R_sram_cell_pull_up_tr - R_access_tr) / 2;
+  double width_write_driver_nmos = R_to_w(target_R_write_driver_and_mux, NCH, is_dram);
+
+  return width_write_driver_nmos;
+}
+
+
+
+// Returns the height occupied by the tag comparators of a mat.
+//
+// The comparator area is modelled as one 2-input NAND (sized with
+// g_tp.w_comp_n) per way and per tag bit, divided by 4; that area is then
+// spread over the given subarray memory-cell-area width to obtain a height.
+//
+//   tagbits                       number of tag bits compared
+//   number_ways_in_mat            number of ways handled by this mat
+//   subarray_mem_cell_area_width  width over which the comparators are laid out
+double Mat::compute_comparators_height(
+    int tagbits,
+    int number_ways_in_mat,
+    double subarray_mem_cell_area_width)
+{
+  const double area_per_nand2 =
+    compute_gate_area(NAND, 2, 0, g_tp.w_comp_n, g_tp.cell_h_def);
+
+  // Evaluated left to right in double precision (no integer division).
+  const double total_comparator_area =
+    area_per_nand2 * number_ways_in_mat * tagbits / 4;
+
+  const double comparators_height =
+    total_comparator_area / subarray_mem_cell_area_width;
+  return comparators_height;
+}
+
+
+
+double Mat::compute_bitline_delay(double inrisetime)
+{
+  double V_b_pre, v_th_mem_cell, V_wl;
+  double tstep;
+  double dynRdEnergy = 0.0, dynWriteEnergy = 0.0;
+  double R_cell_pull_down=0.0, R_cell_acc =0.0, r_dev=0.0;
+  int deg_senseamp_muxing = dp.Ndsam_lev_1 * dp.Ndsam_lev_2;
+
+  double R_b_metal = camFlag? cam_cell.h:cell.h * g_tp.wire_local.R_per_um;
+  double R_bl      = subarray.num_rows * R_b_metal;
+  double C_bl      = subarray.C_bl;
+
+  // TODO: no leakage for DRAMs?
+  double leak_power_cc_inverters_sram_cell = 0;
+  double gate_leak_power_cc_inverters_sram_cell = 0;
+  double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0;
+  double leak_power_RD_port_sram_cell = 0;
+  double gate_leak_power_RD_port_sram_cell = 0;
+
+  if (is_dram == true)
+  {
+    V_b_pre = g_tp.dram.Vbitpre;
+    v_th_mem_cell = g_tp.dram_acc.Vth;
+    V_wl = g_tp.vpp;
+    //The access transistor is not folded. So we just need to specify a threshold value for the
+    //folding width that is equal to or greater than Wmemcella.
+    R_cell_acc = tr_R_on(g_tp.dram.cell_a_w, NCH, 1, true, true);
+    r_dev = g_tp.dram_cell_Vdd / g_tp.dram_cell_I_on + R_bl / 2;
+  }
+  else
+  { //SRAM
+    V_b_pre = g_tp.sram.Vbitpre;
+    v_th_mem_cell = g_tp.sram_cell.Vth;
+    V_wl = g_tp.sram_cell.Vdd;
+    R_cell_pull_down = tr_R_on(g_tp.sram.cell_nmos_w, NCH, 1, false, true);
+    R_cell_acc = tr_R_on(g_tp.sram.cell_a_w, NCH, 1, false, true);
+
+    //Leakage current of an SRAM cell
+    double Iport     = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0,  1, nmos,false, true);//TODO: how much is the idle time? just by *2?
+    double Iport_erp = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0,  2, nmos,false, true);
+    double Icell     = cmos_Isub_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true)*2;//two invs per cell
+
+    leak_power_cc_inverters_sram_cell         = Icell * g_tp.sram_cell.Vdd;
+    leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.sram_cell.Vdd;
+    leak_power_RD_port_sram_cell              = Iport_erp * g_tp.sram_cell.Vdd;
+
+
+    //in idle state, Ig_on only possibly exist in access transistors of read only ports
+    double Ig_port_erp   = cmos_Ig_leakage(g_tp.sram.cell_a_w, 0, 1, nmos,false, true);
+    double Ig_cell   = cmos_Ig_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true);
+
+    gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.sram_cell.Vdd;
+    gate_leak_power_RD_port_sram_cell      = Ig_port_erp*g_tp.sram_cell.Vdd;
+  }
+
+
+  double C_drain_bit_mux = drain_C_(g_tp.w_nmos_b_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP)), is_dram);
+  double R_bit_mux = tr_R_on(g_tp.w_nmos_b_mux, NCH, 1, is_dram);
+  double C_drain_sense_amp_iso = drain_C_(g_tp.w_iso, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+  double R_sense_amp_iso = tr_R_on(g_tp.w_iso, PCH, 1, is_dram);
+  double C_sense_amp_latch = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram) +
+    drain_C_(g_tp.w_sense_n, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    drain_C_(g_tp.w_sense_p, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+  double C_drain_sense_amp_mux = drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+
+  if (is_dram)
+  {
+    double fraction = dp.V_b_sense / ((g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl));
+    tstep = 2.3 * fraction * r_dev *
+      (g_tp.dram_cell_C * (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux)) /
+      (g_tp.dram_cell_C + (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux));
+    delay_writeback = tstep;
+    dynRdEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/;
+    dynWriteEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/ * num_act_mats_hor_dir*100;
+    per_bitline_read_energy = (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd;
+  }
+  else
+  {
+    double tau;
+
+    if (deg_bl_muxing > 1)
+    {
+      tau = (R_cell_pull_down + R_cell_acc) *
+        (C_bl + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_bl * (C_bl/2 + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_bit_mux * (C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux);
+      dynRdEnergy += (C_bl + 2 * C_drain_bit_mux) * 2 * dp.V_b_sense * g_tp.sram_cell.Vdd /*
+        subarray.num_cols * num_subarrays_per_mat*/;
+      dynRdEnergy += (2 * C_drain_sense_amp_iso + C_sense_amp_latch +  C_drain_sense_amp_mux) *
+        2 * dp.V_b_sense * g_tp.sram_cell.Vdd * (1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing);
+      dynWriteEnergy += ((1.0/*subarray.num_cols *num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) *
+          num_act_mats_hor_dir * (C_bl + 2*C_drain_bit_mux) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2;
+      //Write Ops are differential for SRAM
+    }
+    else
+    {
+      tau = (R_cell_pull_down + R_cell_acc) *
+        (C_bl + C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) + R_bl * C_bl / 2 +
+        R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux);
+      dynRdEnergy += (C_bl + 2 * C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+        2 * dp.V_b_sense * g_tp.sram_cell.Vdd /* subarray.num_cols * num_subarrays_per_mat*/;
+      dynWriteEnergy += (((1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) *
+          num_act_mats_hor_dir * C_bl) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2;
+
+    }
+    tstep = tau * log(V_b_pre / (V_b_pre - dp.V_b_sense));
+    power_bitline.readOp.leakage =
+      leak_power_cc_inverters_sram_cell +
+      leak_power_acc_tr_RW_or_WR_port_sram_cell +
+      leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) +
+      leak_power_RD_port_sram_cell * ERP;
+    power_bitline.readOp.gate_leakage = gate_leak_power_cc_inverters_sram_cell +
+      gate_leak_power_RD_port_sram_cell * ERP;
+
+  }
+
+//  cout<<"leak_power_cc_inverters_sram_cell"<<leak_power_cc_inverters_sram_cell<<endl;
+//  cout<<"leak_power_acc_tr_RW_or_WR_port_sram_cell"<<leak_power_acc_tr_RW_or_WR_port_sram_cell<<endl;
+//  cout<<"leak_power_acc_tr_RW_or_WR_port_sram_cell"<<leak_power_acc_tr_RW_or_WR_port_sram_cell<<endl;
+//  cout<<"leak_power_RD_port_sram_cell"<<leak_power_RD_port_sram_cell<<endl;
+
+
+  /* take input rise time into account */
+  double m = V_wl / inrisetime;
+  if (tstep <= (0.5 * (V_wl - v_th_mem_cell) / m))
+  {
+    delay_bitline = sqrt(2 * tstep * (V_wl - v_th_mem_cell)/ m);
+  }
+  else
+  {
+    delay_bitline = tstep + (V_wl - v_th_mem_cell) / (2 * m);
+  }
+
+  bool is_fa = (dp.fully_assoc) ? true : false;
+
+  if (dp.is_tag == false || is_fa == false)
+  {
+    power_bitline.readOp.dynamic  = dynRdEnergy;
+    power_bitline.writeOp.dynamic = dynWriteEnergy;
+  }
+
+  double outrisetime = 0;
+  return outrisetime;
+}
+
+
+
+double Mat::compute_sa_delay(double inrisetime)
+{
+  //int num_sa_subarray = subarray.num_cols / deg_bl_muxing; //in a subarray
+
+  //Bitline circuitry leakage.
+  double Iiso     = simplified_pmos_leakage(g_tp.w_iso, is_dram);
+  double IsenseEn = simplified_nmos_leakage(g_tp.w_sense_en, is_dram);
+  double IsenseN  = simplified_nmos_leakage(g_tp.w_sense_n, is_dram);
+  double IsenseP  = simplified_pmos_leakage(g_tp.w_sense_p, is_dram);
+
+  double lkgIdlePh  = IsenseEn;//+ 2*IoBufP;
+  //double lkgWritePh = Iiso + IsenseEn;// + 2*IoBufP + 2*Ipch;
+  double lkgReadPh  = Iiso + IsenseN + IsenseP;//+ IoBufN + IoBufP + 2*IsPch ;
+  //double lkgRead = lkgReadPh * num_sa_subarray * 4 * num_act_mats_hor_dir +
+  //    lkgIdlePh * num_sa_subarray * 4 * (num_mats - num_act_mats_hor_dir);
+  double lkgIdle = lkgIdlePh /*num_sa_subarray * num_subarrays_per_mat*/;
+  leak_power_sense_amps_closed_page_state = lkgIdlePh * g_tp.peri_global.Vdd /* num_sa_subarray * num_subarrays_per_mat*/;
+  leak_power_sense_amps_open_page_state   = lkgReadPh * g_tp.peri_global.Vdd /* num_sa_subarray * num_subarrays_per_mat*/;
+
+  // sense amplifier has to drive logic in "data out driver" and sense precharge load.
+  // load seen by sense amp. New delay model for sense amp that is sensitive to both the output time
+  //constant as well as the magnitude of input differential voltage.
+  double C_ld = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram) +
+    drain_C_(g_tp.w_sense_n, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    drain_C_(g_tp.w_sense_p, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    drain_C_(g_tp.w_iso,PCH,1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+  double tau = C_ld / g_tp.gm_sense_amp_latch;
+  delay_sa = tau * log(g_tp.peri_global.Vdd / dp.V_b_sense);
+  power_sa.readOp.dynamic = C_ld * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd /* num_sa_subarray
+                            num_subarrays_per_mat * num_act_mats_hor_dir*/;
+  power_sa.readOp.leakage = lkgIdle * g_tp.peri_global.Vdd;
+
+  double outrisetime = 0;
+  return outrisetime;
+}
+
+
+
+double Mat::compute_subarray_out_drv(double inrisetime)
+{
+  double C_ld, rd, tf, this_delay;
+  double p_to_n_sz_r = pmos_to_nmos_sz_ratio(is_dram);
+
+  // delay of signal through pass-transistor of first level of sense-amp mux to input of inverter-buffer.
+  rd = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram);
+  C_ld = dp.Ndsam_lev_1 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    gate_C(g_tp.min_w_nmos_ + p_to_n_sz_r * g_tp.min_w_nmos_, 0.0, is_dram);
+  tf = rd * C_ld;
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += this_delay;
+  inrisetime = this_delay/(1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.leakage += 0;  // for now, let leakage of the pass transistor be 0
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos)* g_tp.peri_global.Vdd;
+  // delay of signal through inverter-buffer to second level of sense-amp mux.
+  // internal delay of buffer
+  rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram);
+  C_ld = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    drain_C_(p_to_n_sz_r * g_tp.min_w_nmos_, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    gate_C(g_tp.min_w_nmos_ + p_to_n_sz_r * g_tp.min_w_nmos_, 0.0, is_dram);
+  tf = rd * C_ld;
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += this_delay;
+  inrisetime = this_delay/(1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic      += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.leakage      += cmos_Isub_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv, is_dram)* g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd;
+
+  // inverter driving drain of pass transistor of second level of sense-amp mux.
+  rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram);
+  C_ld = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    drain_C_(p_to_n_sz_r * g_tp.min_w_nmos_, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP + SCHP), is_dram);
+  tf = rd * C_ld;
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += this_delay;
+  inrisetime = this_delay/(1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic      += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.leakage      += cmos_Isub_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd;
+
+
+  // delay of signal through pass-transistor to input of subarray output driver.
+  rd = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram);
+  C_ld = dp.Ndsam_lev_2 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP + SCHP), is_dram) +
+    //gate_C(subarray_out_wire->repeater_size * g_tp.min_w_nmos_ * (1 + p_to_n_sz_r), 0.0, is_dram);
+    gate_C(subarray_out_wire->repeater_size *(subarray_out_wire->wire_length/subarray_out_wire->repeater_spacing) * g_tp.min_w_nmos_ * (1 + p_to_n_sz_r), 0.0, is_dram);
+  tf = rd * C_ld;
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += this_delay;
+  inrisetime = this_delay/(1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power_subarray_out_drv.readOp.leakage += 0;  // for now, let leakage of the pass transistor be 0
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos)* g_tp.peri_global.Vdd;
+
+
+  return inrisetime;
+}
+
+
+
+// Computes the tag-comparator delay together with its dynamic energy,
+// subthreshold leakage and gate leakage.
+//
+// The comparator is modelled as a chain of three inverters followed by an
+// evaluation stage that discharges the compare node towards the mux driver.
+//
+// inrisetime  input rise time of the first inverter.
+//
+// Side effects (members written):
+//   delay_comparator                       sum of the four stage delays
+//   power_comparator.readOp.dynamic        accumulated (+=) per stage
+//   power_comparator.readOp.leakage        lkgCurrent * Vdd
+//   power_comparator.readOp.gate_leakage   gatelkgCurrent * Vdd
+//
+// Returns the evaluation-stage delay divided by (1 - VTHMUXNAND), i.e. the
+// rise time handed to the next stage.
+//
+// VTHCOMPINV, VTHEVALINV and VTHMUXNAND are defined elsewhere; presumably
+// they are switching thresholds expressed as a fraction of Vdd -- confirm.
+double Mat::compute_comparator_delay(double inrisetime)
+{
+  // Tag associativity: every per-stage energy/leakage term below is scaled
+  // by 4 * A (four quarter comparators per way).
+  int A = g_ip->tag_assoc;
+
+  int tagbits_ = dp.tagbits / 4; // Assuming there are 4 quarter comparators. input tagbits is already
+  // a multiple of 4.
+
+  /* First Inverter */
+  // Load: gate of the second inverter plus own drain capacitances.
+  double Ceq = gate_C(g_tp.w_comp_inv_n2+g_tp.w_comp_inv_p2, 0, is_dram) +
+               drain_C_(g_tp.w_comp_inv_p1, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+               drain_C_(g_tp.w_comp_inv_n1, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  double Req = tr_R_on(g_tp.w_comp_inv_p1, PCH, 1, is_dram);
+  double tf  = Req*Ceq;
+  double st1del = horowitz(inrisetime,tf,VTHCOMPINV,VTHCOMPINV,FALL);
+  double nextinputtime = st1del/VTHCOMPINV;
+  power_comparator.readOp.dynamic += 0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A;
+
+  //For each degree of associativity
+  //there are 4 such quarter comparators
+  double lkgCurrent   = cmos_Isub_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram)* 4 * A;
+  double gatelkgCurrent = cmos_Ig_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram)* 4 * A;
+  /* Second Inverter */
+  // Load: gate of the third inverter plus own drain capacitances.
+  Ceq = gate_C(g_tp.w_comp_inv_n3+g_tp.w_comp_inv_p3, 0, is_dram) +
+    drain_C_(g_tp.w_comp_inv_p2, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    drain_C_(g_tp.w_comp_inv_n2, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  Req = tr_R_on(g_tp.w_comp_inv_n2, NCH, 1, is_dram);
+  tf = Req*Ceq;
+  double st2del = horowitz(nextinputtime,tf,VTHCOMPINV,VTHCOMPINV,RISE);
+  nextinputtime = st2del/(1.0-VTHCOMPINV);
+  power_comparator.readOp.dynamic += 0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A;
+  lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram)* 4 * A;
+  gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram)* 4 * A;
+
+  /* Third Inverter */
+  // Load: gate of the evaluation inverter plus own drain capacitances.
+  Ceq = gate_C(g_tp.w_eval_inv_n+g_tp.w_eval_inv_p, 0, is_dram) +
+    drain_C_(g_tp.w_comp_inv_p3, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+    drain_C_(g_tp.w_comp_inv_n3, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  Req = tr_R_on(g_tp.w_comp_inv_p3, PCH, 1, is_dram);
+  tf = Req*Ceq;
+  double st3del = horowitz(nextinputtime,tf,VTHCOMPINV,VTHEVALINV,FALL);
+  nextinputtime = st3del/(VTHEVALINV);
+  power_comparator.readOp.dynamic += 0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A;
+  lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram)* 4 * A;
+  gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram)* 4 * A;
+
+  /* Final Inverter (virtual ground driver) discharging compare part */
+  // r1/c1: compare transistors and the node feeding the mux-driver NAND.
+  // r2/c2: evaluation inverter and the node it discharges.
+  double r1 = tr_R_on(g_tp.w_comp_n,NCH,2, is_dram);
+  double r2 = tr_R_on(g_tp.w_eval_inv_n,NCH,1, is_dram); /* was switch */
+  double c2 = (tagbits_)*(drain_C_(g_tp.w_comp_n,NCH,1, 1, g_tp.cell_h_def, is_dram) +
+                   drain_C_(g_tp.w_comp_n,NCH,2, 1, g_tp.cell_h_def, is_dram)) +
+       drain_C_(g_tp.w_eval_inv_p,PCH,1, 1, g_tp.cell_h_def, is_dram) +
+       drain_C_(g_tp.w_eval_inv_n,NCH,1, 1, g_tp.cell_h_def, is_dram);
+  double c1 = (tagbits_)*(drain_C_(g_tp.w_comp_n,NCH,1, 1, g_tp.cell_h_def, is_dram) +
+                          drain_C_(g_tp.w_comp_n,NCH,2, 1, g_tp.cell_h_def, is_dram)) +
+    drain_C_(g_tp.w_comp_p,PCH,1, 1, g_tp.cell_h_def, is_dram) +
+    gate_C(WmuxdrvNANDn+WmuxdrvNANDp,0, is_dram);
+  // c2 is charged in all 4 * A quarter comparators; c1 is counted (A - 1) times.
+  power_comparator.readOp.dynamic += 0.5 * c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A;
+  power_comparator.readOp.dynamic += c1 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *  (A - 1);
+  lkgCurrent += cmos_Isub_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram)* 4 * A;
+  lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram)* 4 * A;  // stack factor of 0.2
+
+  gatelkgCurrent += cmos_Ig_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram)* 4 * A;
+  gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram)* 4 * A;//for gate leakage this equals to a inverter
+
+  /* time to go to threshold of mux driver */
+  double tstep = (r2*c2+(r1+r2)*c1)*log(1.0/VTHMUXNAND);
+  /* take into account non-zero input rise time */
+  // m is the slope of the input ramp (Vdd over the input transition time).
+  double m = g_tp.peri_global.Vdd/nextinputtime;
+  double Tcomparatorni;
+
+  if((tstep) <= (0.5*(g_tp.peri_global.Vdd-g_tp.peri_global.Vth)/m))
+  {
+    // Slow-input case: the delay is the positive root of a*t^2 + b*t + c = 0.
+    double a = m;
+    double b = 2*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth);
+    double c = -2*(tstep)*(g_tp.peri_global.Vdd-g_tp.peri_global.Vth)+1/m*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth)*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth);
+    Tcomparatorni = (-b+sqrt(b*b-4*a*c))/(2*a);
+  }
+  else
+  {
+    // Fast-input case: step-response delay plus a ramp correction term.
+    Tcomparatorni = (tstep) + (g_tp.peri_global.Vdd+g_tp.peri_global.Vth)/(2*m) - (g_tp.peri_global.Vdd*VTHEVALINV)/m;
+  }
+  delay_comparator = Tcomparatorni+st1del+st2del+st3del;
+  power_comparator.readOp.leakage = lkgCurrent * g_tp.peri_global.Vdd;
+  power_comparator.readOp.gate_leakage = gatelkgCurrent * g_tp.peri_global.Vdd;
+
+  // Convert the evaluation-stage delay into a rise time for the next stage.
+  return Tcomparatorni / (1.0 - VTHMUXNAND);;
+}
+
+
+
+void Mat::compute_power_energy()
+{
+        //for cam and FA, power.readOp is the plain read power, power.searchOp is the associative search related power
+    //when search all subarrays and all mats are fully active
+        //when plain read/write only one subarray in a single mat is active.
+
+    // add energy consumed in predecoder drivers. This unit is shared by all subarrays in a mat.
+  power.readOp.dynamic += r_predec->power.readOp.dynamic +
+                          b_mux_predec->power.readOp.dynamic +
+                          sa_mux_lev_1_predec->power.readOp.dynamic +
+                          sa_mux_lev_2_predec->power.readOp.dynamic;
+
+  // add energy consumed in decoders
+  power_row_decoders.readOp.dynamic        = row_dec->power.readOp.dynamic;
+  if (!(is_fa||pure_cam))
+    power_row_decoders.readOp.dynamic        *= num_subarrays_per_mat;
+
+  // add energy consumed in bitline prechagers, SAs, and bitlines
+  if (!(is_fa||pure_cam))
+  {
+          // add energy consumed in bitline prechagers
+          power_bl_precharge_eq_drv.readOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic;
+          power_bl_precharge_eq_drv.readOp.dynamic *= num_subarrays_per_mat;
+
+          //Add sense amps energy
+          num_sa_subarray = subarray.num_cols / deg_bl_muxing;
+          power_sa.readOp.dynamic *= num_sa_subarray*num_subarrays_per_mat ;
+
+          // add energy consumed in bitlines
+          //cout<<"bitline power"<<power_bitline.readOp.dynamic<<endl;
+          power_bitline.readOp.dynamic *= num_subarrays_per_mat*subarray.num_cols;
+          power_bitline.writeOp.dynamic *= num_subarrays_per_mat*subarray.num_cols;
+          //cout<<"bitline power"<<power_bitline.readOp.dynamic<<"subarray"<<num_subarrays_per_mat<<"cols"<<subarray.num_cols<<endl;
+          //Add subarray output energy
+          power_subarray_out_drv.readOp.dynamic =
+                  (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+          power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                                  power_sa.readOp.dynamic +
+                                  power_bitline.readOp.dynamic +
+                                  power_subarray_out_drv.readOp.dynamic;
+
+          power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                                  bit_mux_dec->power.readOp.dynamic +
+                                  sa_mux_lev_1_dec->power.readOp.dynamic +
+                                  sa_mux_lev_2_dec->power.readOp.dynamic +
+                                  power_comparator.readOp.dynamic;
+  }
+
+  else if (is_fa)
+  {
+          //for plain read/write only one subarray in a mat is active
+          // add energy consumed in bitline prechagers
+          power_bl_precharge_eq_drv.readOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic
+                   + cam_bl_precharge_eq_drv->power.readOp.dynamic;
+          power_bl_precharge_eq_drv.searchOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic;
+
+          //Add sense amps energy
+          num_sa_subarray = (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram)/ deg_bl_muxing;
+          num_sa_subarray_search = subarray.num_cols_fa_ram/ deg_bl_muxing;
+          power_sa.searchOp.dynamic = power_sa.readOp.dynamic*num_sa_subarray_search;
+          power_sa.readOp.dynamic *= num_sa_subarray;
+
+
+          // add energy consumed in bitlines
+          power_bitline.searchOp.dynamic = power_bitline.readOp.dynamic;
+          power_bitline.readOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram);
+          power_bitline.writeOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram);
+          power_bitline.searchOp.dynamic *= subarray.num_cols_fa_ram;
+
+          //Add subarray output energy
+      power_subarray_out_drv.searchOp.dynamic =
+                  (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat;
+          power_subarray_out_drv.readOp.dynamic =
+                  (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+
+          power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                                  power_sa.readOp.dynamic +
+                                  power_bitline.readOp.dynamic +
+                                  power_subarray_out_drv.readOp.dynamic;
+
+          power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                                  bit_mux_dec->power.readOp.dynamic +
+                                  sa_mux_lev_1_dec->power.readOp.dynamic +
+                                  sa_mux_lev_2_dec->power.readOp.dynamic +
+                                  power_comparator.readOp.dynamic;
+
+          //add energy consumed inside cam
+          power_matchline.searchOp.dynamic *= num_subarrays_per_mat;
+          power_searchline_precharge = sl_precharge_eq_drv->power;
+      power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat;
+      power_searchline = sl_data_drv->power;
+      power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;;
+      power_matchline_precharge  = ml_precharge_drv->power;
+      power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat;
+      power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power;
+      power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic;
+
+          power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic;
+
+          power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic;
+          //power.searchOp.dynamic += ml_to_ram_wl_drv->power.readOp.dynamic;
+
+  }
+  else
+  {
+          // add energy consumed in bitline prechagers
+          power_bl_precharge_eq_drv.readOp.dynamic = cam_bl_precharge_eq_drv->power.readOp.dynamic;
+          //power_bl_precharge_eq_drv.readOp.dynamic *= num_subarrays_per_mat;
+          //power_bl_precharge_eq_drv.searchOp.dynamic = cam_bl_precharge_eq_drv->power.readOp.dynamic;
+          //power_bl_precharge_eq_drv.searchOp.dynamic *= num_subarrays_per_mat;
+
+          //Add sense amps energy
+          num_sa_subarray = subarray.num_cols_fa_cam/ deg_bl_muxing;
+          power_sa.readOp.dynamic *= num_sa_subarray;//*num_subarrays_per_mat;
+          power_sa.searchOp.dynamic = 0;
+
+          power_bitline.readOp.dynamic *= subarray.num_cols_fa_cam;
+          power_bitline.searchOp.dynamic = 0;
+          power_bitline.writeOp.dynamic *= subarray.num_cols_fa_cam;
+
+          power_subarray_out_drv.searchOp.dynamic =
+                  (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat;
+          power_subarray_out_drv.readOp.dynamic =
+                          (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+          power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                                  power_sa.readOp.dynamic +
+                                  power_bitline.readOp.dynamic +
+                                  power_subarray_out_drv.readOp.dynamic;
+
+          power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                                  bit_mux_dec->power.readOp.dynamic +
+                                  sa_mux_lev_1_dec->power.readOp.dynamic +
+                                  sa_mux_lev_2_dec->power.readOp.dynamic +
+                                  power_comparator.readOp.dynamic;
+
+
+          ////add energy consumed inside cam
+          power_matchline.searchOp.dynamic *= num_subarrays_per_mat;
+          power_searchline_precharge = sl_precharge_eq_drv->power;
+      power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat;
+      power_searchline = sl_data_drv->power;
+      power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;;
+      power_matchline_precharge  = ml_precharge_drv->power;
+      power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat;
+      power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power;
+      power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic;
+
+          power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic;
+          power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic;
+
+          power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic;
+          //power.searchOp.dynamic += ml_to_ram_wl_drv->power.readOp.dynamic;
+
+  }
+
+
+
+  // calculate leakage power
+  if (!(is_fa || pure_cam))
+  {
+        int number_output_drivers_subarray = num_sa_subarray / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2);
+
+        power_bitline.readOp.leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+    power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP);
+
+    //num_sa_subarray             = subarray.num_cols / deg_bl_muxing;
+    power_subarray_out_drv.readOp.leakage =
+      (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP);
+
+    power.readOp.leakage += power_bitline.readOp.leakage +
+                            power_bl_precharge_eq_drv.readOp.leakage +
+                            power_sa.readOp.leakage +
+                            power_subarray_out_drv.readOp.leakage;
+    //cout<<"leakage"<<power.readOp.leakage<<endl;
+
+    power_comparator.readOp.leakage *= num_do_b_mat * (RWP + ERP);
+    power.readOp.leakage += power_comparator.readOp.leakage;
+
+    //cout<<"leakage1"<<power.readOp.leakage<<endl;
+
+    // leakage power
+    power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat;
+    power_bit_mux_decoders.readOp.leakage      = bit_mux_dec->power.readOp.leakage * deg_bl_muxing;
+    power_sa_mux_lev_1_decoders.readOp.leakage = sa_mux_lev_1_dec->power.readOp.leakage * dp.Ndsam_lev_1;
+    power_sa_mux_lev_2_decoders.readOp.leakage = sa_mux_lev_2_dec->power.readOp.leakage * dp.Ndsam_lev_2;
+
+    power.readOp.leakage += r_predec->power.readOp.leakage +
+                          b_mux_predec->power.readOp.leakage +
+                          sa_mux_lev_1_predec->power.readOp.leakage +
+                          sa_mux_lev_2_predec->power.readOp.leakage +
+                          power_row_decoders.readOp.leakage +
+                          power_bit_mux_decoders.readOp.leakage +
+                          power_sa_mux_lev_1_decoders.readOp.leakage +
+                          power_sa_mux_lev_2_decoders.readOp.leakage;
+    //cout<<"leakage2"<<power.readOp.leakage<<endl;
+
+    //++++Below is gate leakage
+        power_bitline.readOp.gate_leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+    power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP);
+
+    //num_sa_subarray             = subarray.num_cols / deg_bl_muxing;
+    power_subarray_out_drv.readOp.gate_leakage =
+      (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP);
+
+    power.readOp.gate_leakage += power_bitline.readOp.gate_leakage +
+                            power_bl_precharge_eq_drv.readOp.gate_leakage +
+                            power_sa.readOp.gate_leakage +
+                            power_subarray_out_drv.readOp.gate_leakage;
+    //cout<<"leakage"<<power.readOp.leakage<<endl;
+
+    power_comparator.readOp.gate_leakage *= num_do_b_mat * (RWP + ERP);
+    power.readOp.gate_leakage += power_comparator.readOp.gate_leakage;
+
+    //cout<<"leakage1"<<power.readOp.gate_leakage<<endl;
+
+    // gate_leakage power
+    power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat;
+    power_bit_mux_decoders.readOp.gate_leakage      = bit_mux_dec->power.readOp.gate_leakage * deg_bl_muxing;
+    power_sa_mux_lev_1_decoders.readOp.gate_leakage = sa_mux_lev_1_dec->power.readOp.gate_leakage * dp.Ndsam_lev_1;
+    power_sa_mux_lev_2_decoders.readOp.gate_leakage = sa_mux_lev_2_dec->power.readOp.gate_leakage * dp.Ndsam_lev_2;
+
+    power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+                          b_mux_predec->power.readOp.gate_leakage +
+                          sa_mux_lev_1_predec->power.readOp.gate_leakage +
+                          sa_mux_lev_2_predec->power.readOp.gate_leakage +
+                          power_row_decoders.readOp.gate_leakage +
+                          power_bit_mux_decoders.readOp.gate_leakage +
+                          power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+                          power_sa_mux_lev_2_decoders.readOp.gate_leakage;
+  }
+  else if (is_fa)
+  {
+          int number_output_drivers_subarray = num_sa_subarray;// / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2);
+
+          power_bitline.readOp.leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+          power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+          power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+          power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+          //cout<<"leakage3"<<power.readOp.leakage<<endl;
+
+
+          power_subarray_out_drv.readOp.leakage =
+                  (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+                  number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+          power.readOp.leakage += power_bitline.readOp.leakage +
+                                  power_bl_precharge_eq_drv.readOp.leakage +
+                                  power_bl_precharge_eq_drv.searchOp.leakage +
+                                  power_sa.readOp.leakage +
+                                  power_subarray_out_drv.readOp.leakage;
+
+          //cout<<"leakage4"<<power.readOp.leakage<<endl;
+
+          // leakage power
+          power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat;
+          power.readOp.leakage += r_predec->power.readOp.leakage +
+                                  power_row_decoders.readOp.leakage;
+
+          //cout<<"leakage5"<<power.readOp.leakage<<endl;
+
+          //inside cam
+          power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage;
+          power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage;
+          power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam;
+          power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.dynamic;
+          power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat;
+
+          power.readOp.leakage += power_cam_all_active.searchOp.leakage;
+
+//       cout<<"leakage6"<<power.readOp.leakage<<endl;
+
+          //+++Below is gate leakage
+          power_bitline.readOp.gate_leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+          power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+          power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+          power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+          //cout<<"leakage3"<<power.readOp.gate_leakage<<endl;
+
+
+          power_subarray_out_drv.readOp.gate_leakage =
+                  (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+                  number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+          power.readOp.gate_leakage += power_bitline.readOp.gate_leakage +
+          power_bl_precharge_eq_drv.readOp.gate_leakage +
+          power_bl_precharge_eq_drv.searchOp.gate_leakage +
+          power_sa.readOp.gate_leakage +
+          power_subarray_out_drv.readOp.gate_leakage;
+
+          //cout<<"leakage4"<<power.readOp.gate_leakage<<endl;
+
+          // gate_leakage power
+          power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat;
+          power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+          power_row_decoders.readOp.gate_leakage;
+
+          //cout<<"leakage5"<<power.readOp.gate_leakage<<endl;
+
+          //inside cam
+          power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage;
+          power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage;
+          power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam;
+          power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.dynamic;
+          power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat;
+
+          power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage;
+
+  }
+  else
+  {
+          int number_output_drivers_subarray = num_sa_subarray;// / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2);
+
+          //power_bitline.readOp.leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+          //power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+          power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+          power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+
+          power_subarray_out_drv.readOp.leakage =
+                  (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+                  number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+          power.readOp.leakage += //power_bitline.readOp.leakage +
+                                  //power_bl_precharge_eq_drv.readOp.leakage +
+                                  power_bl_precharge_eq_drv.searchOp.leakage +
+                                  power_sa.readOp.leakage +
+                                  power_subarray_out_drv.readOp.leakage;
+
+          // leakage power
+          power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP);
+          power.readOp.leakage += r_predec->power.readOp.leakage +
+                                  power_row_decoders.readOp.leakage;
+
+          //inside cam
+          power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage;
+          power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage;
+          power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam;
+          power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.dynamic;
+          power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat;
+
+          power.readOp.leakage += power_cam_all_active.searchOp.leakage;
+
+          //+++Below is gate leakage
+          power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+          power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+
+          power_subarray_out_drv.readOp.gate_leakage =
+                  (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+                  number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+          power.readOp.gate_leakage += //power_bitline.readOp.gate_leakage +
+                                  //power_bl_precharge_eq_drv.readOp.gate_leakage +
+                                  power_bl_precharge_eq_drv.searchOp.gate_leakage +
+                                  power_sa.readOp.gate_leakage +
+                                  power_subarray_out_drv.readOp.gate_leakage;
+
+          // gate_leakage power
+          power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP);
+          power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+                                  power_row_decoders.readOp.gate_leakage;
+
+          //inside cam
+          power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage;
+          power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage;
+          power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam;
+          power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.dynamic;
+          power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat;
+
+          power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage;
+  }
+}
+
diff --git a/ext/mcpat/cacti/mat.h b/ext/mcpat/cacti/mat.h
new file mode 100755 (executable)
index 0000000..8d038be
--- /dev/null
@@ -0,0 +1,148 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __MAT_H__
+#define __MAT_H__
+
+#include "component.h"
+#include "decoder.h"
+#include "subarray.h"
+#include "wire.h"
+
+/*
+ * Mat: a Component that groups several subarrays (see num_subarrays_per_mat)
+ * together with the decoders, predecoders, drivers and output wire declared
+ * below, and keeps per-component delay and power bookkeeping for them.
+ *
+ * Nearly all state is public and is read directly by other code.
+ */
+class Mat : public Component
+{
+  public:
+    Mat(const DynamicParameter & dyn_p);
+    ~Mat();
+    double compute_delays(double inrisetime);  // return outrisetime
+    // Folds the per-component dynamic, leakage and gate-leakage terms
+    // (power_bitline, power_sa, power_row_decoders, ...) into `power`.
+    void compute_power_energy();
+
+    // Held by reference, not copied.
+    // NOTE(review): presumably bound to the constructor argument, which would
+    // then have to outlive this Mat -- confirm in mat.cc.
+    const DynamicParameter & dp;
+
+    // TODO: clean up pointers and powerDefs below
+    Decoder * row_dec;
+    Decoder * bit_mux_dec;
+    Decoder * sa_mux_lev_1_dec;
+    Decoder * sa_mux_lev_2_dec;
+    PredecBlk * dummy_way_sel_predec_blk1;
+    PredecBlk * dummy_way_sel_predec_blk2;
+    PredecBlkDrv * way_sel_drv1;
+    PredecBlkDrv * dummy_way_sel_predec_blk_drv2;
+
+    Predec * r_predec;
+    Predec * b_mux_predec;
+    Predec * sa_mux_lev_1_predec;
+    Predec * sa_mux_lev_2_predec;
+
+    Wire   * subarray_out_wire;
+    Driver * bl_precharge_eq_drv;
+    Driver * cam_bl_precharge_eq_drv;//bitline precharge circuit is separate for CAM and RAM arrays.
+    Driver * ml_precharge_drv;//matchline precharge driver
+    Driver * sl_precharge_eq_drv;//searchline precharge driver
+    Driver * sl_data_drv;//search line data driver
+    Driver * ml_to_ram_wl_drv;//NOTE(review): presumably the matchline-to-RAM-wordline driver (old comment was copied from sl_data_drv) -- confirm
+
+
+    // Per-component power totals; compute_power_energy() scales these and
+    // sums them into `power`.
+    powerDef power_row_decoders;
+    powerDef power_bit_mux_decoders;
+    powerDef power_sa_mux_lev_1_decoders;
+    powerDef power_sa_mux_lev_2_decoders;
+    powerDef power_fa_cam;  // TODO: leakage power is not computed yet
+    powerDef power_bl_precharge_eq_drv;
+    powerDef power_subarray_out_drv;
+    powerDef power_cam_all_active;
+    powerDef power_searchline_precharge;
+    powerDef power_matchline_precharge;
+    powerDef power_ml_to_ram_wl_drv;
+
+    double   delay_fa_tag, delay_cam;
+    double   delay_before_decoder;
+    double   delay_bitline;
+    double   delay_wl_reset;
+    double   delay_bl_restore;
+
+    double   delay_searchline;
+    double   delay_matchchline;  // sic: the misspelled name is part of the public interface
+    double   delay_cam_sl_restore;
+    double   delay_cam_ml_reset;
+    double   delay_fa_ram_wl;
+
+    double   delay_hit_miss_reset;
+    double   delay_hit_miss;
+
+    Subarray subarray;
+    powerDef power_bitline, power_searchline, power_matchline;
+    double   per_bitline_read_energy;
+    int      deg_bl_muxing;
+    int      num_act_mats_hor_dir;
+    double   delay_writeback;
+    Area     cell,cam_cell;
+    // is_fa and pure_cam select which branch of compute_power_energy() runs.
+    bool     is_dram,is_fa, pure_cam, camFlag;
+    int      num_mats;
+    powerDef power_sa;
+    double   delay_sa;
+    double   leak_power_sense_amps_closed_page_state;
+    double   leak_power_sense_amps_open_page_state;
+    double   delay_subarray_out_drv;
+    double   delay_subarray_out_drv_htree;
+    double   delay_comparator;
+    powerDef power_comparator;
+    int      num_do_b_mat;
+    int      num_so_b_mat;
+    int      num_sa_subarray;
+    int      num_sa_subarray_search;
+    double   C_bl;
+
+    uint32_t num_subarrays_per_mat;  // the number of subarrays in a mat
+    uint32_t num_subarrays_per_row;  // the number of subarrays in a row of a mat
+
+
+  private:
+    double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
+    double width_write_driver_or_write_mux();
+    double compute_comparators_height(int tagbits, int number_ways_in_mat, double subarray_mem_cell_area_w);
+    double compute_cam_delay(double inrisetime);
+    double compute_bitline_delay(double inrisetime);
+    double compute_sa_delay(double inrisetime);
+    double compute_subarray_out_drv(double inrisetime);
+    double compute_comparator_delay(double inrisetime);
+
+    // Used as multipliers in the leakage sums of compute_power_energy().
+    // NOTE(review): presumably port counts (read/write, exclusive-read,
+    // exclusive-write, search) -- confirm against the constructor.
+    int RWP;
+    int ERP;
+    int EWP;
+    int SCHP;
+};
+
+
+
+#endif
diff --git a/ext/mcpat/cacti/nuca.cc b/ext/mcpat/cacti/nuca.cc
new file mode 100644 (file)
index 0000000..2aabe84
--- /dev/null
@@ -0,0 +1,612 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+
+#include "Ucache.h"
+#include "nuca.h"
+
+/*
+ * Lower bound on the bank size explored by Nuca::sim_nuca(). This is a
+ * mutable global: sim_nuca() doubles it repeatedly, in place, when the cache
+ * associativity is greater than 2.
+ * NOTE(review): presumably expressed in the same unit as g_ip->cache_sz
+ * (bytes?) -- confirm.
+ */
+unsigned int MIN_BANKSIZE=65536;
+/* Both values below are subtracted from the clock period in Nuca::calc_cycles(). */
+#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
+#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
+/* Constant term added to every per-bank access latency in Nuca::sim_nuca(). */
+#define CONTR_2_BANK_LAT 0
+
+/*
+ * Contention table. Nuca::init_cont() fills it from contention.dat and
+ * Nuca::sim_nuca() reads it. init_cont() only writes core indices 2..4;
+ * being a global, the remaining entries stay zero-initialised.
+ */
+int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
+
+  // Remember the device type in `deviceType` (defaulting to
+  // g_tp.peri_global) and load the contention table through init_cont().
+  Nuca::Nuca(TechnologyParameter::DeviceType *dt = &(g_tp.peri_global))
+    : deviceType(dt)
+{
+  init_cont();
+}
+
+void
+Nuca::init_cont()
+{
+  FILE *cont;
+  char line[5000];
+  char jk[5000];
+  cont = fopen("contention.dat", "r");
+  if (!cont) {
+    cout << "contention.dat file is missing!\n";
+    exit(0);
+  }
+
+  for(int i=0; i<2; i++) {
+    for(int j=2; j<5; j++) {
+      for(int k=0; k<ROUTER_TYPES; k++) {
+        for(int l=0;l<7; l++) {
+          int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
+          assert(fscanf(cont, "%[^\n]\n", line) != EOF);
+          sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3],
+              &temp[4], &temp[5], &temp[6], &temp[7]);
+        }
+      }
+    }
+  }
+  fclose(cont);
+}
+
+  void
+Nuca::print_cont_stats()
+{
+  for(int i=0; i<2; i++) {
+    for(int j=2; j<5; j++) {
+      for(int k=0; k<ROUTER_TYPES; k++) {
+        for(int l=0;l<7; l++) {
+          for(int m=0;l<7; l++) {
+            cout << cont_stats[i][j][k][l][m] << " ";
+          }
+          cout << endl;
+        }
+      }
+    }
+  }
+  cout << endl;
+}
+
+// Delete the vertical and horizontal wire objects stored for every wire type
+// index in the inclusive range [wt_min, wt_max].
+Nuca::~Nuca()
+{
+  int wire_type = wt_min;
+  while (wire_type <= wt_max) {
+    delete wire_vertical[wire_type];
+    delete wire_horizontal[wire_type];
+    ++wire_type;
+  }
+}
+
+/*
+ * Convert a latency `lat` (seconds) into a whole number of clock cycles at
+ * `oper_freq` (GHz). The usable part of each clock period is the period
+ * minus LATCH_DELAY and FIXED_OVERHEAD; the result is rounded up.
+ */
+  int
+Nuca::calc_cycles(double lat, double oper_freq)
+{
+  //TODO: convert latch delay to FO4 */
+  const double period = 1.0 / (oper_freq * 1e9); /*s*/
+  const double after_latch = period - LATCH_DELAY;
+  const double usable = after_latch - FIXED_OVERHEAD;
+  const double cycles = ceil(lat / usable);
+
+  return (int)cycles;
+}
+
+
+// The destructor frees nothing. Nuca::sim_nuca() fills h_wire, v_wire and
+// router with pointers that it also keeps in wire_horizontal[],
+// wire_vertical[] and router_s[]; the wires are deleted by Nuca::~Nuca() and
+// the routers at the end of sim_nuca().
+// NOTE(review): presumably the deletes below were disabled to avoid freeing
+// those shared objects twice -- confirm before re-enabling them.
+nuca_org_t::~nuca_org_t() {
+  // if(h_wire) delete h_wire;
+  // if(v_wire) delete v_wire;
+  // if(router) delete router;
+}
+
+/*
+ * Version - 6.0
+ *
+ * Perform exhaustive search across different bank organizatons,
+ * router configurations, grid organizations, and wire models and
+ * find an optimal NUCA organization
+ * For different bank count values
+ * 1. Optimal bank organization is calculated
+ * 2. For each bank organization, find different NUCA organizations
+ *    using various router configurations, grid organizations,
+ *    and wire models.
+ * 3. NUCA model with the least cost is picked for
+ *    this particular bank count
+ * Finally include contention statistics and find the optimal
+ *    NUCA configuration
+ */
+  void
+Nuca::sim_nuca()
+{
+  /* temp variables */
+  int it, ro, wr;
+  int num_cyc;
+  unsigned int i, j, k;
+  unsigned int r, c;
+  int l2_c;
+  int bank_count = 0;
+  uca_org_t ures;
+  nuca_org_t *opt_n;
+  mem_array tag, data;
+  list<nuca_org_t *> nuca_list;
+  Router *router_s[ROUTER_TYPES];
+  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
+  router_s[0]->print_router();
+  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
+  router_s[1]->print_router();
+  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
+  router_s[2]->print_router();
+
+  int core_in; // to store no. of cores
+
+  /* to search diff grid organizations */
+  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
+         curr_acclat;
+  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
+         avg_leakage_power;
+
+  double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF;
+  int opt_rows = 0;
+  int opt_columns = 0;
+  double opt_totno_hops = 0;
+  double opt_avg_hop = 0;
+  double opt_dyn_power = 0, opt_leakage_power = 0;
+  min_values_t minval;
+
+  int bank_start = 0;
+
+  int flit_width = 0;
+
+  /* vertical and horizontal hop latency values */
+  int ver_hop_lat, hor_hop_lat; /* in cycles */
+
+
+  /* no. of different bank sizes to consider */
+  int iterations;
+
+
+  g_ip->nuca_cache_sz = g_ip->cache_sz;
+  nuca_list.push_back(new nuca_org_t());
+
+  if (g_ip->cache_level == 0) l2_c = 1;
+  else l2_c = 0;
+
+  if (g_ip->cores <= 4) core_in = 2;
+  else if (g_ip->cores <= 8) core_in = 3;
+  else if (g_ip->cores <= 16) core_in = 4;
+  else {cout << "Number of cores should be <= 16!\n"; exit(0);}
+
+
+  // set the lower bound to an appropriate value. this depends on cache associativity
+  if (g_ip->assoc > 2) {
+    i = 2;
+    while (i != g_ip->assoc) {
+      MIN_BANKSIZE *= 2;
+      i *= 2;
+    }
+  }
+
+  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);
+
+  if (g_ip->force_wiretype)
+  {
+    if (g_ip->wt == Low_swing) {
+      wt_min = Low_swing;
+      wt_max = Low_swing;
+    }
+    else {
+      wt_min = Global;
+      wt_max = Low_swing-1;
+    }
+  }
+  else {
+    wt_min = Global;
+    wt_max = Low_swing;
+  }
+  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
+    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
+        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
+        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
+      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
+    }
+    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
+    iterations = bank_start+1;
+    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
+  }
+  cout << "Simulating various NUCA configurations\n";
+  for (it=bank_start; it<iterations; it++) { /* different bank count values */
+    ures.tag_array2 = &tag;
+    ures.data_array2 = &data;
+    /*
+     * find the optimal bank organization
+     */
+    solve(&ures);
+//    output_UCA(&ures);
+    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
+    cout << "====" <<  g_ip->cache_sz << "\n";
+
+    for (wr=wt_min; wr<=wt_max; wr++) {
+
+      for (ro=0; ro<ROUTER_TYPES; ro++)
+      {
+        flit_width = (int) router_s[ro]->flit_size; //initialize router
+        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
+
+        /* calculate router and wire parameters */
+
+        double vlength = ures.cache_ht; /* length of the wire (u)*/
+        double hlength = ures.cache_len; // u
+
+        /* find delay, area, and power for wires */
+        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
+        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
+
+
+        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+
+        /*
+         * assume a grid like topology and explore for optimal network
+         * configuration using different row and column count values.
+         */
+        for (c=1; c<=(unsigned int)bank_count; c++) {
+          while (bank_count%c != 0) c++;
+          r = bank_count/c;
+
+          /*
+           * to find the avg access latency of a NUCA cache, uncontended
+           * access time to each bank from the
+           * cache controller is calculated.
+           * avg latency =
+           * sum of the access latencies to individual banks)/bank
+           * count value.
+           */
+          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
+          k = 1;
+          for (i=0; i<r; i++) {
+            for (j=0; j<c; j++) {
+              /*
+               * vertical hops including the
+               * first hop from the cache controller
+               */
+              curr_hop = i + 1;
+              curr_hop += j; /* horizontal hops */
+              totno_hhops += j;
+              totno_vhops += (i+1);
+              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
+                  j * hor_hop_lat);
+
+              tot_lat += curr_acclat;
+              totno_hops += curr_hop;
+            }
+          }
+          avg_lat = tot_lat/bank_count;
+          avg_hop = totno_hops/bank_count;
+          avg_hhop = totno_hhops/bank_count;
+          avg_vhop = totno_vhops/bank_count;
+
+          /* net access latency */
+          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
+            calc_cycles(ures.access_time,
+                1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+
+          /* avg access lat of nuca */
+          avg_dyn_power =
+            avg_hop *
+            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
+            (wire_horizontal[wr]->power.readOp.dynamic) *
+            (g_ip->block_sz*8 + 64) + avg_vhop *
+            (wire_vertical[wr]->power.readOp.dynamic) *
+            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;
+
+          avg_leakage_power =
+            bank_count * router_s[ro]->power.readOp.leakage +
+            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
+                wire_horizontal[wr]->delay) * flit_width +
+            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
+                wire_horizontal[wr]->delay);
+
+          if (curr_acclat < opt_acclat) {
+            opt_acclat = curr_acclat;
+            opt_tot_lat = tot_lat;
+            opt_avg_lat = avg_lat;
+            opt_totno_hops = totno_hops;
+            opt_avg_hop = avg_hop;
+            opt_rows = r;
+            opt_columns = c;
+            opt_dyn_power = avg_dyn_power;
+            opt_leakage_power = avg_leakage_power;
+          }
+          totno_hops = 0;
+          tot_lat = 0;
+          totno_hhops = 0;
+          totno_vhops = 0;
+        }
+        nuca_list.back()->wire_pda.power.readOp.dynamic =
+          opt_avg_hop * flit_width *
+          (wire_horizontal[wr]->power.readOp.dynamic +
+           wire_vertical[wr]->power.readOp.dynamic);
+        nuca_list.back()->avg_hops = opt_avg_hop;
+        /* network delay/power */
+        nuca_list.back()->h_wire = wire_horizontal[wr];
+        nuca_list.back()->v_wire = wire_vertical[wr];
+        nuca_list.back()->router = router_s[ro];
+        /* bank delay/power */
+
+        nuca_list.back()->bank_pda.delay = ures.access_time;
+        nuca_list.back()->bank_pda.power = ures.power;
+        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
+        nuca_list.back()->bank_pda.area.w = ures.cache_len;
+        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
+
+        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
+        if(num_cyc%2 != 0) num_cyc++;
+        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
+
+        if (it < 7) {
+          nuca_list.back()->nuca_pda.delay = opt_acclat +
+            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
+          nuca_list.back()->contention =
+            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
+        }
+        else {
+          nuca_list.back()->nuca_pda.delay = opt_acclat +
+            cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
+          nuca_list.back()->contention =
+            cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
+        }
+        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
+        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
+
+        /* array organization */
+        nuca_list.back()->bank_count = bank_count;
+        nuca_list.back()->rows = opt_rows;
+        nuca_list.back()->columns = opt_columns;
+        calculate_nuca_area (nuca_list.back());
+
+        minval.update_min_values(nuca_list.back());
+        nuca_list.push_back(new nuca_org_t());
+        opt_acclat = BIGNUM;
+
+      }
+    }
+    g_ip->cache_sz /= 2;
+  }
+
+  delete(nuca_list.back());
+  nuca_list.pop_back();
+  opt_n = find_optimal_nuca(&nuca_list, &minval);
+  print_nuca(opt_n);
+  g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
+
+  list<nuca_org_t *>::iterator niter;
+  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
+  {
+    delete *niter;
+  }
+  nuca_list.clear();
+
+  for(int i=0; i < ROUTER_TYPES; i++)
+  {
+    delete router_s[i];
+  }
+  g_ip->display_ip();
+  //  g_ip->force_cache_config = true;
+  //  g_ip->ndwl = 8;
+  //  g_ip->ndbl = 16;
+  //  g_ip->nspd = 4;
+  //  g_ip->ndcm = 1;
+  //  g_ip->ndsam1 = 8;
+  //  g_ip->ndsam2 = 32;
+
+}
+
+
+  void
+Nuca::print_nuca (nuca_org_t *fr)
+{
+  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
+      "----------\n\n");
+  printf("Optimal number of banks - %d\n", fr->bank_count);
+  printf("Grid organization rows x columns - %d x %d\n",
+      fr->rows, fr->columns);
+  printf("Network frequency - %g GHz\n",
+      (1/fr->nuca_pda.cycle_time)*1e3);
+  printf("Cache dimension (mm x mm) - %g x %g\n",
+      fr->nuca_pda.area.h,
+      fr->nuca_pda.area.w);
+
+  fr->router->print_router();
+
+  printf("\n\nWire stats:\n");
+  if (fr->h_wire->wt == Global) {
+    printf("\tWire type - Full swing global wires with least "
+        "possible delay\n");
+  }
+  else if (fr->h_wire->wt == Global_5) {
+    printf("\tWire type - Full swing global wires with "
+        "5%% delay penalty\n");
+  }
+  else if (fr->h_wire->wt == Global_10) {
+    printf("\tWire type - Full swing global wires with "
+        "10%% delay penalty\n");
+  }
+  else if (fr->h_wire->wt == Global_20) {
+    printf("\tWire type - Full swing global wires with "
+        "20%% delay penalty\n");
+  }
+  else if (fr->h_wire->wt == Global_30) {
+    printf("\tWire type - Full swing global wires with "
+        "30%% delay penalty\n");
+  }
+  else if(fr->h_wire->wt == Low_swing) {
+    printf("\tWire type - Low swing wires\n");
+  }
+
+  printf("\tHorizontal link delay - %g (ns)\n",
+      fr->h_wire->delay*1e9);
+  printf("\tVertical link delay - %g (ns)\n",
+      fr->v_wire->delay*1e9);
+  printf("\tDelay/length - %g (ns/mm)\n",
+      fr->h_wire->delay*1e9/fr->bank_pda.area.w);
+  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
+      "\t                       -leakage %g (nW)\n\n",
+      fr->h_wire->power.readOp.dynamic*1e9,
+      fr->h_wire->power.readOp.leakage*1e9);
+  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
+      "\t                     -leakage %g (nW)\n\n",
+      fr->v_wire->power.readOp.dynamic*1e9,
+      fr->v_wire->power.readOp.leakage*1e9);
+  printf("\n\n");
+  fr->v_wire->print_wire();
+  printf("\n\nBank stats:\n");
+}
+
+
+  nuca_org_t *
+Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
+{
+  double cost = 0;
+  double min_cost = BIGNUM;
+  nuca_org_t *res = NULL;
+  float d, a, dp, lp, c;
+  int v;
+  dp = g_ip->dynamic_power_wt_nuca;
+  lp = g_ip->leakage_power_wt_nuca;
+  a = g_ip->area_wt_nuca;
+  d = g_ip->delay_wt_nuca;
+  c = g_ip->cycle_time_wt_nuca;
+
+  list<nuca_org_t *>::iterator niter;
+
+
+  for (niter = n->begin(); niter != n->end(); niter++) {
+    fprintf(stderr, "\n-----------------------------"
+        "---------------\n");
+
+
+    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
+        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
+        (*niter)->bank_count,
+        (*niter)->nuca_pda.delay,
+        (*niter)->nuca_pda.power.readOp.dynamic,
+        (*niter)->h_wire->wt,
+        (*niter)->bank_pda.power.readOp.dynamic,
+        (*niter)->nuca_pda.power.readOp.leakage,
+        (*niter)->nuca_pda.cycle_time);
+
+
+    if (g_ip->ed == 1) {
+      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost) {
+        min_cost = cost;
+        res = ((*niter));
+      }
+    }
+    else if (g_ip->ed == 2) {
+      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost) {
+        min_cost = cost;
+        res = ((*niter));
+      }
+    }
+    else {
+      /*
+       * check whether the current organization
+       * meets the input deviation constraints
+       */
+      v = check_nuca_org((*niter), minval);
+      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
+
+      if (v) {
+        cost = (d  * ((*niter)->nuca_pda.delay/minval->min_delay) +
+            c  * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) +
+            dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
+            lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) +
+            a  * ((*niter)->nuca_pda.area.get_area()/minval->min_area));
+        fprintf(stderr, "cost = %g\n", cost);
+
+        if (min_cost > cost) {
+          min_cost = cost;
+          res = ((*niter));
+        }
+      }
+      else {
+        niter = n->erase(niter);
+        if (niter !=n->begin())
+                niter --;
+      }
+    }
+  }
+  return res;
+}
+
+  int
+Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
+{
+  if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) {
+    return 0;
+  }
+  if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+      g_ip->dynamic_power_dev_nuca) {
+    return 0;
+  }
+  if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+      g_ip->leakage_power_dev_nuca) {
+    return 0;
+  }
+  if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+      g_ip->cycle_time_dev_nuca) {
+    return 0;
+  }
+  if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 >
+      g_ip->area_dev_nuca) {
+    return 0;
+  }
+  return 1;
+}
+
+/*
+ * Fill in nuca->nuca_pda.area from the grid shape and the bank size:
+ *
+ *   h = rows    * ((h_wire width + spacing) * router flit_size + bank h)
+ *   w = columns * ((v_wire width + spacing) * router flit_size + bank w)
+ *
+ * rows, columns, h_wire, v_wire, router and bank_pda.area must already be
+ * set on the organization.
+ */
+  void
+Nuca::calculate_nuca_area (nuca_org_t *nuca)
+{
+  Wire *horiz = nuca->h_wire;
+  Wire *vert = nuca->v_wire;
+  Router *rtr = nuca->router;
+
+  /* height, built from the horizontal wire and the bank height */
+  nuca->nuca_pda.area.h = nuca->rows *
+    ((horiz->wire_width + horiz->wire_spacing) * rtr->flit_size +
+     nuca->bank_pda.area.h);
+
+  /* width, built from the vertical wire and the bank width */
+  nuca->nuca_pda.area.w = nuca->columns *
+    ((vert->wire_width + vert->wire_spacing) * rtr->flit_size +
+     nuca->bank_pda.area.w);
+}
+
diff --git a/ext/mcpat/cacti/nuca.h b/ext/mcpat/cacti/nuca.h
new file mode 100644 (file)
index 0000000..adfe325
--- /dev/null
@@ -0,0 +1,100 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __NUCA_H__
+#define __NUCA_H__
+
+#include <iostream>
+
+#include "assert.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "io.h"
+#include "mat.h"
+#include "parameter.h"
+#include "router.h"
+#include "wire.h"
+
+/*
+ * One candidate NUCA organization: the network components chosen for it,
+ * its grid shape, and the delay/power/area figures computed for it.
+ * Nuca keeps these in a list and compares them in find_optimal_nuca().
+ */
+class nuca_org_t {
+  public:
+  ~nuca_org_t();
+//    int size;
+    /* area, power, access time, and cycle time stats */
+    Component nuca_pda;  // figures for the NUCA as a whole (delay includes contention)
+    Component bank_pda;  // figures for a single bank
+    Component wire_pda;  // figures for the links
+    /*
+     * Network components selected for this organization. Nuca assigns
+     * these from its wire arrays and router table - presumably they are
+     * not owned by this object; confirm against ~nuca_org_t().
+     */
+    Wire *h_wire;        // horizontal link
+    Wire *v_wire;        // vertical link
+    Router *router;
+    /* for particular network configuration
+     * calculated based on a cycle accurate
+     * simulation Ref: CACTI 6 - Tech report
+     */
+    double contention;
+
+    /* grid network stats */
+    double avg_hops;     // average hop count of the selected grid shape
+    int rows;            // grid rows
+    int columns;         // grid columns
+    int bank_count;      // number of banks
+};
+
+
+
+/*
+ * NUCA (non-uniform cache access) model: builds candidate organizations,
+ * evaluates them and reports the selected one.
+ */
+class Nuca : public Component
+{
+  public:
+    Nuca(
+        TechnologyParameter::DeviceType *dt);
+    void print_router();
+    ~Nuca();
+    /* Sweep the candidate organizations, pick one and print it. */
+    void sim_nuca();
+    void init_cont();
+    /* Number of cycles for latency lat at frequency oper_freq (units as supplied by the caller - TODO confirm). */
+    int calc_cycles(double lat, double oper_freq);
+    /* Fill nuca->nuca_pda.area from the grid shape, wires, router and bank size. */
+    void calculate_nuca_area (nuca_org_t *nuca);
+    /* Return 1 if n is within the g_ip deviation limits relative to minval, else 0. */
+    int check_nuca_org (nuca_org_t *n, min_values_t *minval);
+    /* Return the lowest-cost entry of *n (NULL if none); may erase rejected entries from *n. */
+    nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval);
+    /* Print a report for organization n to stdout. */
+    void print_nuca(nuca_org_t *n);
+    void print_cont_stats();
+
+  private:
+
+    TechnologyParameter::DeviceType *deviceType;
+    int wt_min, wt_max;  // presumably the range of wire types to evaluate - TODO confirm
+    /* Per-wire-type vertical and horizontal link models, indexed by wire type. */
+    Wire *wire_vertical[WIRE_TYPES],
+         *wire_horizontal[WIRE_TYPES];
+
+};
+
+
+#endif
diff --git a/ext/mcpat/cacti/parameter.cc b/ext/mcpat/cacti/parameter.cc
new file mode 100644 (file)
index 0000000..b71640c
--- /dev/null
@@ -0,0 +1,713 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "area.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+/*
+ * Global input parameters, dereferenced as g_ip->... by the model code.
+ * Only the pointer is defined here; presumably it is pointed at a real
+ * InputParameter object during start-up - confirm against the caller.
+ */
+InputParameter * g_ip;
+/* Global technology parameters, accessed as g_tp.<field>. */
+TechnologyParameter g_tp;
+
+
+
+void TechnologyParameter::DeviceType::display(uint32_t indent)
+{
+  string indent_str(indent, ' ');
+
+  cout << indent_str << "C_g_ideal = " << setw(12) << C_g_ideal << " F/um" << endl;
+  cout << indent_str << "C_fringe  = " << setw(12) << C_fringe  << " F/um" << endl;
+  cout << indent_str << "C_overlap = " << setw(12) << C_overlap << " F/um" << endl;
+  cout << indent_str << "C_junc    = " << setw(12) << C_junc    << " F/um^2" << endl;
+  cout << indent_str << "l_phy     = " << setw(12) << l_phy     << " um" << endl;
+  cout << indent_str << "l_elec    = " << setw(12) << l_elec    << " um" << endl;
+  cout << indent_str << "R_nch_on  = " << setw(12) << R_nch_on  << " ohm-um" << endl;
+  cout << indent_str << "R_pch_on  = " << setw(12) << R_pch_on  << " ohm-um" << endl;
+  cout << indent_str << "Vdd       = " << setw(12) << Vdd       << " V" << endl;
+  cout << indent_str << "Vth       = " << setw(12) << Vth       << " V" << endl;
+  cout << indent_str << "I_on_n    = " << setw(12) << I_on_n    << " A/um" << endl;
+  cout << indent_str << "I_on_p    = " << setw(12) << I_on_p    << " A/um" << endl;
+  cout << indent_str << "I_off_n   = " << setw(12) << I_off_n   << " A/um" << endl;
+  cout << indent_str << "I_off_p   = " << setw(12) << I_off_p   << " A/um" << endl;
+  cout << indent_str << "C_ox      = " << setw(12) << C_ox      << " F/um^2" << endl;
+  cout << indent_str << "t_ox      = " << setw(12) << t_ox      << " um" << endl;
+  cout << indent_str << "n_to_p_eff_curr_drv_ratio = " << n_to_p_eff_curr_drv_ratio << endl;
+}
+
+
+
+void TechnologyParameter::InterconnectType::display(uint32_t indent)
+{
+  string indent_str(indent, ' ');
+
+  cout << indent_str << "pitch    = " << setw(12) << pitch    << " um" << endl;
+  cout << indent_str << "R_per_um = " << setw(12) << R_per_um << " ohm/um" << endl;
+  cout << indent_str << "C_per_um = " << setw(12) << C_per_um << " F/um" << endl;
+}
+
+/*
+ * Print the logic scaling coefficient and the core transistor density to
+ * cout, each line prefixed by `indent` spaces.
+ */
+void TechnologyParameter::ScalingFactor::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+
+  cout << pad << "logic_scaling_co_eff    = " << setw(12) << logic_scaling_co_eff << endl
+       << pad << "curr_core_tx_density = " << setw(12) << core_tx_density << " # of tx/um^2" << endl;
+}
+
+void TechnologyParameter::MemoryType::display(uint32_t indent)
+{
+  string indent_str(indent, ' ');
+
+  cout << indent_str << "b_w         = " << setw(12) << b_w << " um" << endl;
+  cout << indent_str << "b_h         = " << setw(12) << b_h << " um" << endl;
+  cout << indent_str << "cell_a_w    = " << setw(12) << cell_a_w << " um" << endl;
+  cout << indent_str << "cell_pmos_w = " << setw(12) << cell_pmos_w << " um" << endl;
+  cout << indent_str << "cell_nmos_w = " << setw(12) << cell_nmos_w << " um" << endl;
+  cout << indent_str << "Vbitpre     = " << setw(12) << Vbitpre << " V" << endl;
+}
+
+
+
+void TechnologyParameter::display(uint32_t indent)
+{
+  string indent_str(indent, ' ');
+
+  cout << indent_str << "ram_wl_stitching_overhead_ = " << setw(12) << ram_wl_stitching_overhead_ << " um" << endl;
+  cout << indent_str << "min_w_nmos_                = " << setw(12) << min_w_nmos_                << " um" << endl;
+  cout << indent_str << "max_w_nmos_                = " << setw(12) << max_w_nmos_                << " um" << endl;
+  cout << indent_str << "unit_len_wire_del          = " << setw(12) << unit_len_wire_del          << " s/um^2" << endl;
+  cout << indent_str << "FO4                        = " << setw(12) << FO4                        << " s" << endl;
+  cout << indent_str << "kinv                       = " << setw(12) << kinv                       << " s" << endl;
+  cout << indent_str << "vpp                        = " << setw(12) << vpp                        << " V" << endl;
+  cout << indent_str << "w_sense_en                 = " << setw(12) << w_sense_en                 << " um" << endl;
+  cout << indent_str << "w_sense_n                  = " << setw(12) << w_sense_n                  << " um" << endl;
+  cout << indent_str << "w_sense_p                  = " << setw(12) << w_sense_p                  << " um" << endl;
+  cout << indent_str << "w_iso                      = " << setw(12) << w_iso                      << " um" << endl;
+  cout << indent_str << "w_poly_contact             = " << setw(12) << w_poly_contact             << " um" << endl;
+  cout << indent_str << "spacing_poly_to_poly       = " << setw(12) << spacing_poly_to_poly       << " um" << endl;
+  cout << indent_str << "spacing_poly_to_contact    = " << setw(12) << spacing_poly_to_contact    << " um" << endl;
+  cout << endl;
+  cout << indent_str << "w_comp_inv_p1              = " << setw(12) << w_comp_inv_p1 << " um" << endl;
+  cout << indent_str << "w_comp_inv_p2              = " << setw(12) << w_comp_inv_p2 << " um" << endl;
+  cout << indent_str << "w_comp_inv_p3              = " << setw(12) << w_comp_inv_p3 << " um" << endl;
+  cout << indent_str << "w_comp_inv_n1              = " << setw(12) << w_comp_inv_n1 << " um" << endl;
+  cout << indent_str << "w_comp_inv_n2              = " << setw(12) << w_comp_inv_n2 << " um" << endl;
+  cout << indent_str << "w_comp_inv_n3              = " << setw(12) << w_comp_inv_n3 << " um" << endl;
+  cout << indent_str << "w_eval_inv_p               = " << setw(12) << w_eval_inv_p  << " um" << endl;
+  cout << indent_str << "w_eval_inv_n               = " << setw(12) << w_eval_inv_n  << " um" << endl;
+  cout << indent_str << "w_comp_n                   = " << setw(12) << w_comp_n      << " um" << endl;
+  cout << indent_str << "w_comp_p                   = " << setw(12) << w_comp_p      << " um" << endl;
+  cout << endl;
+  cout << indent_str << "dram_cell_I_on             = " << setw(12) << dram_cell_I_on << " A/um" << endl;
+  cout << indent_str << "dram_cell_Vdd              = " << setw(12) << dram_cell_Vdd  << " V" << endl;
+  cout << indent_str << "dram_cell_I_off_worst_case_len_temp = " << setw(12) << dram_cell_I_off_worst_case_len_temp << " A/um" << endl;
+  cout << indent_str << "dram_cell_C                = " << setw(12) << dram_cell_C               << " F" << endl;
+  cout << indent_str << "gm_sense_amp_latch         = " << setw(12) << gm_sense_amp_latch        << " F/s" << endl;
+  cout << endl;
+  cout << indent_str << "w_nmos_b_mux               = " << setw(12) << w_nmos_b_mux              << " um" << endl;
+  cout << indent_str << "w_nmos_sa_mux              = " << setw(12) << w_nmos_sa_mux             << " um" << endl;
+  cout << indent_str << "w_pmos_bl_precharge        = " << setw(12) << w_pmos_bl_precharge       << " um" << endl;
+  cout << indent_str << "w_pmos_bl_eq               = " << setw(12) << w_pmos_bl_eq              << " um" << endl;
+  cout << indent_str << "MIN_GAP_BET_P_AND_N_DIFFS  = " << setw(12) << MIN_GAP_BET_P_AND_N_DIFFS << " um" << endl;
+  cout << indent_str << "HPOWERRAIL                 = " << setw(12) << HPOWERRAIL                << " um" << endl;
+  cout << indent_str << "cell_h_def                 = " << setw(12) << cell_h_def                << " um" << endl;
+
+  cout << endl;
+  cout << indent_str << "SRAM cell transistor: " << endl;
+  sram_cell.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "DRAM access transistor: " << endl;
+  dram_acc.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "DRAM wordline transistor: " << endl;
+  dram_wl.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "peripheral global transistor: " << endl;
+  peri_global.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "wire local" << endl;
+  wire_local.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "wire inside mat" << endl;
+  wire_inside_mat.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "wire outside mat" << endl;
+  wire_outside_mat.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "SRAM" << endl;
+  sram.display(indent + 2);
+
+  cout << endl;
+  cout << indent_str << "DRAM" << endl;
+  dram.display(indent + 2);
+}
+
+
+/*
+ * Default constructor: clears use_inp_params, value-initializes cell and
+ * marks the object as valid. No other member is set here.
+ */
+DynamicParameter::DynamicParameter()
+  : use_inp_params(0),
+    cell(),
+    is_valid(true)
+{
+}
+
+
+
+DynamicParameter::DynamicParameter(
+    bool is_tag_,
+    int pure_ram_,
+    int pure_cam_,
+    double Nspd_,
+    unsigned int Ndwl_,
+    unsigned int Ndbl_,
+    unsigned int Ndcm_,
+    unsigned int Ndsam_lev_1_,
+    unsigned int Ndsam_lev_2_,
+    bool is_main_mem_):
+  is_tag(is_tag_), pure_ram(pure_ram_), pure_cam(pure_cam_), tagbits(0), Nspd(Nspd_), Ndwl(Ndwl_), Ndbl(Ndbl_),Ndcm(Ndcm_),
+  Ndsam_lev_1(Ndsam_lev_1_), Ndsam_lev_2(Ndsam_lev_2_),
+  number_way_select_signals_mat(0), V_b_sense(0), use_inp_params(0),
+  is_main_mem(is_main_mem_), cell(), is_valid(false)
+{
+  ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
+  is_dram            = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
+
+  unsigned int capacity_per_die = g_ip->cache_sz / NUMBER_STACKED_DIE_LAYERS;  // capacity per stacked die layer
+  const TechnologyParameter::InterconnectType & wire_local = g_tp.wire_local;
+  fully_assoc = (g_ip->fully_assoc) ? true : false;
+
+  if (fully_assoc || pure_cam)
+  { // fully-assocative cache -- ref: CACTi 2.0 report
+          if (Ndwl != 1 ||            //Ndwl is fixed to 1 for FA
+                          Ndcm != 1 ||            //Ndcm is fixed to 1 for FA
+                          Nspd < 1 || Nspd > 1 || //Nspd is fixed to 1 for FA
+                          Ndsam_lev_1 != 1 ||     //Ndsam_lev_1 is fixed to one
+                          Ndsam_lev_2 != 1 ||     //Ndsam_lev_2 is fixed to one
+                          Ndbl < 2)
+          {
+          return;
+          }
+  }
+
+  if ((is_dram) && (!is_tag) && (Ndcm > 1))
+  {
+          return;  // For a DRAM array, each bitline has its own sense-amp
+  }
+
+  // If it's not an FA tag/data array, Ndwl should be at least two and Ndbl should be
+  // at least two because an array is assumed to have at least one mat. And a mat
+  // is formed out of two horizontal subarrays and two vertical subarrays
+  if (fully_assoc == false && (Ndwl < 1 || Ndbl < 1))
+  {
+          return;
+  }
+
+  //***********compute row, col of an subarray
+  if (!(fully_assoc || pure_cam))//Not fully_asso nor cam
+  {
+          // if data array, let tagbits = 0
+          if (is_tag)
+          {
+                  if (g_ip->specific_tag)
+                  {
+                          tagbits = g_ip->tag_w;
+                  }
+                  else
+                  {
+                          tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(capacity_per_die) +
+                          _log2(g_ip->tag_assoc*2 - 1) - _log2(g_ip->nbanks);
+
+                  }
+                  tagbits = (((tagbits + 3) >> 2) << 2);
+
+                  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
+                                  g_ip->block_sz * g_ip->tag_assoc * Ndbl * Nspd));// + EPSILON);
+                  num_c_subarray = (int)ceil((tagbits * g_ip->tag_assoc * Nspd / Ndwl));// + EPSILON);
+                  //burst_length = 1;
+          }
+          else
+          {
+                  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
+                                  g_ip->block_sz * g_ip->data_assoc * Ndbl * Nspd));// + EPSILON);
+                  num_c_subarray = (int)ceil((8 * g_ip->block_sz * g_ip->data_assoc * Nspd / Ndwl));// + EPSILON); + EPSILON);
+                  // burst_length = g_ip->block_sz * 8 / g_ip->out_w;
+          }
+
+          if (num_r_subarray < MINSUBARRAYROWS) return;
+          if (num_r_subarray == 0) return;
+          if (num_r_subarray > MAXSUBARRAYROWS) return;
+          if (num_c_subarray < MINSUBARRAYCOLS) return;
+          if (num_c_subarray > MAXSUBARRAYCOLS) return;
+
+  }
+
+  else
+  {//either fully-asso or cam
+          if (pure_cam)
+          {
+                  if (g_ip->specific_tag)
+                  {
+                          tagbits = int(ceil(g_ip->tag_w/8.0)*8);
+                  }
+                  else
+                  {
+                          tagbits = int(ceil((ADDRESS_BITS + EXTRA_TAG_BITS)/8.0)*8);
+//                       cout<<"Pure CAM needs tag width to be specified"<<endl;
+//                       exit(0);
+                  }
+                  //tagbits = (((tagbits + 3) >> 2) << 2);
+
+                  tag_num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks*tagbits/8.0 * Ndbl));//TODO: error check input of tagbits and blocksize //TODO: for pure CAM, g_ip->block should be number of entries.
+                  //tag_num_c_subarray = (int)(tagbits  + EPSILON);
+                  tag_num_c_subarray = tagbits;
+                  if (tag_num_r_subarray == 0) return;
+                  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
+                  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
+                  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
+                  num_r_subarray = tag_num_r_subarray;
+          }
+          else //fully associative
+          {
+                  if (g_ip->specific_tag)
+                  {
+                          tagbits = g_ip->tag_w;
+                  }
+                  else
+                  {
+                          tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(g_ip->block_sz);//TODO: should be the page_offset=log2(page size), but this info is not avail with CACTI, for McPAT this is no problem.
+                  }
+                  tagbits = (((tagbits + 3) >> 2) << 2);
+
+                  tag_num_r_subarray = (int)(capacity_per_die / (g_ip->nbanks*g_ip->block_sz * Ndbl));
+                  tag_num_c_subarray = (int)ceil((tagbits * Nspd / Ndwl));// + EPSILON);
+                  if (tag_num_r_subarray == 0) return;
+                  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
+                  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
+                  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
+
+                  data_num_r_subarray = tag_num_r_subarray;
+                  data_num_c_subarray = 8 * g_ip->block_sz;
+                  if (data_num_r_subarray == 0) return;
+                  if (data_num_r_subarray > MAXSUBARRAYROWS) return;
+                  if (data_num_c_subarray < MINSUBARRAYCOLS) return;
+                  if (data_num_c_subarray > MAXSUBARRAYCOLS) return;
+                  num_r_subarray = tag_num_r_subarray;
+          }
+  }
+
+  num_subarrays = Ndwl * Ndbl;
+  //****************end of computation of row, col of an subarray
+
+  // calculate wire parameters
+  if (fully_assoc || pure_cam)
+  {
+          cam_cell.h = g_tp.cam.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
+          + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
+          cam_cell.w = g_tp.cam.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
+          + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
+
+          cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +g_ip->num_rw_ports-1 + g_ip->num_rd_ports)
+          + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
+          cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports -1 + (g_ip->num_rd_ports - g_ip->num_se_rd_ports)
+                          + g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
+  }
+  else
+  {
+          if(is_tag)
+          {
+                  cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_rd_ports +
+                                  g_ip->num_wr_ports);
+                  cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_wr_ports +
+                                  (g_ip->num_rd_ports - g_ip->num_se_rd_ports)) +
+                                  wire_local.pitch * g_ip->num_se_rd_ports;
+          }
+          else
+          {
+                  if (is_dram)
+                  {
+                          cell.h = g_tp.dram.b_h;
+                          cell.w = g_tp.dram.b_w;
+                  }
+                  else
+                  {
+                          cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +
+                                          g_ip->num_rw_ports - 1 + g_ip->num_rd_ports);
+                          cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 +
+                                          (g_ip->num_rd_ports - g_ip->num_se_rd_ports) +
+                                          g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports;
+                  }
+          }
+  }
+
+  double c_b_metal = cell.h * wire_local.C_per_um;
+  double C_bl;
+
+  if (!(fully_assoc || pure_cam))
+  {
+          if (is_dram)
+          {
+                  deg_bl_muxing = 1;
+                  if (ram_cell_tech_type == comm_dram)
+                  {
+                          C_bl  = num_r_subarray * c_b_metal;
+                          V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C / (g_tp.dram_cell_C + C_bl);
+                          if (V_b_sense < VBITSENSEMIN)
+                          {
+                                  return;
+                          }
+                          V_b_sense = VBITSENSEMIN;  // in any case, we fix sense amp input signal to a constant value
+                          dram_refresh_period = 64e-3;
+                  }
+                  else
+                  {
+                          double Cbitrow_drain_cap = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;
+                          C_bl  = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+                          V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl);
+
+                          if (V_b_sense < VBITSENSEMIN)
+                          {
+                                  return; //Sense amp input signal is smaller that minimum allowable sense amp input signal
+                          }
+                          V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value
+                          //v_storage_worst = g_tp.dram_cell_Vdd / 2 - VBITSENSEMIN * (g_tp.dram_cell_C + C_bl) / g_tp.dram_cell_C;
+                          //dram_refresh_period = 1.1 * g_tp.dram_cell_C * v_storage_worst / g_tp.dram_cell_I_off_worst_case_len_temp;
+                          dram_refresh_period = 0.9 * g_tp.dram_cell_C * VDD_STORAGE_LOSS_FRACTION_WORST * g_tp.dram_cell_Vdd / g_tp.dram_cell_I_off_worst_case_len_temp;
+                  }
+          }
+          else
+          { //SRAM
+                  V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
+                  deg_bl_muxing = Ndcm;
+                  // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
+                  // contacts in a physical layout
+                  double Cbitrow_drain_cap = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;
+                  C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+                  dram_refresh_period = 0;
+          }
+  }
+  else
+  {
+          c_b_metal = cam_cell.h * wire_local.C_per_um;//IBM and SUN design, SRAM array uses dummy cells to fill the blank space due to mismatch on CAM-RAM
+          V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
+          deg_bl_muxing = 1;//FA fix as 1
+          // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
+          // contacts in a physical layout
+          double Cbitrow_drain_cap = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;//TODO: comment out these two lines
+          C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+          dram_refresh_period = 0;
+  }
+
+
+  // do/di: data in/out, for fully associative they are the data width for normal read and write
+  // so/si: search data in/out, for fully associative they are the data width for the search ops
+  // for CAM, si=di, but so = matching address. do = data out = di (for normal read/write)
+  // so/si needs broadcase while do/di do not
+
+  if (fully_assoc || pure_cam)
+  {
+            switch (Ndbl) {
+              case (0):
+                cout <<  "   Invalid Ndbl \n"<<endl;
+                exit(0);
+                break;
+              case (1):
+                  num_mats_h_dir = 1;//one subarray per mat
+                  num_mats_v_dir = 1;
+                break;
+              case (2):
+                  num_mats_h_dir = 1;//two subarrays per mat
+                  num_mats_v_dir = 1;
+                  break;
+              default:
+                  num_mats_h_dir = int(floor(sqrt(Ndbl/4.0)));//4 subbarrys per mat
+                  num_mats_v_dir = int(Ndbl/4.0 / num_mats_h_dir);
+            }
+            num_mats = num_mats_h_dir * num_mats_v_dir;
+
+            if (fully_assoc)
+            {
+                num_so_b_mat   = data_num_c_subarray;
+                num_do_b_mat   = data_num_c_subarray + tagbits;
+            }
+            else
+            {
+                num_so_b_mat = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
+                num_do_b_mat = tagbits;
+            }
+  }
+  else
+  {
+          num_mats_h_dir = MAX(Ndwl / 2, 1);
+          num_mats_v_dir = MAX(Ndbl / 2, 1);
+          num_mats       = num_mats_h_dir * num_mats_v_dir;
+          num_do_b_mat   = MAX((num_subarrays/num_mats) * num_c_subarray / (deg_bl_muxing * Ndsam_lev_1 * Ndsam_lev_2), 1);
+  }
+
+  if (!(fully_assoc|| pure_cam) && (num_do_b_mat < (num_subarrays/num_mats)))
+  {
+          return;
+  }
+
+
+  int deg_sa_mux_l1_non_assoc;
+  //TODO:the i/o for subbank is not necessary and should be removed.
+  if (!(fully_assoc || pure_cam))
+  {
+          if (!is_tag)
+          {
+                  if (is_main_mem == true)
+                  {
+                          num_do_b_subbank = g_ip->int_prefetch_w * g_ip->out_w;
+                          deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+                  }
+                  else
+                  {
+                          if (g_ip->fast_access == true)
+                          {
+                                  num_do_b_subbank = g_ip->out_w * g_ip->data_assoc;
+                                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+                          }
+                          else
+                          {
+
+                                  num_do_b_subbank = g_ip->out_w;
+                                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1 / g_ip->data_assoc;
+                                  if (deg_sa_mux_l1_non_assoc < 1)
+                                  {
+                                          return;
+                                  }
+
+                          }
+                  }
+          }
+          else
+          {
+                  num_do_b_subbank = tagbits * g_ip->tag_assoc;
+                  if (num_do_b_mat < tagbits)
+                  {
+                          return;
+                  }
+                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+                  //num_do_b_mat = g_ip->tag_assoc / num_mats_h_dir;
+          }
+  }
+  else
+  {
+          if (fully_assoc)
+          {
+                  num_so_b_subbank = 8 * g_ip->block_sz;//TODO:internal perfetch should be considered also for fa
+                  num_do_b_subbank = num_so_b_subbank + tag_num_c_subarray;
+          }
+          else
+          {
+                  num_so_b_subbank = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
+                  num_do_b_subbank = tag_num_c_subarray;
+          }
+
+          deg_sa_mux_l1_non_assoc = 1;
+  }
+
+  deg_senseamp_muxing_non_associativity = deg_sa_mux_l1_non_assoc;
+
+  if (fully_assoc || pure_cam)
+  {
+          num_act_mats_hor_dir = 1;
+          num_act_mats_hor_dir_sl = num_mats_h_dir;//TODO: this is unnecessary, since search op, num_mats is used
+  }
+  else
+  {
+          num_act_mats_hor_dir = num_do_b_subbank / num_do_b_mat;
+          if (num_act_mats_hor_dir == 0)
+          {
+                  return;
+          }
+  }
+
+  //compute num_do_mat for tag
+  if (is_tag)
+  {
+          if (!(fully_assoc || pure_cam))
+          {
+                  num_do_b_mat     = g_ip->tag_assoc / num_act_mats_hor_dir;
+                  num_do_b_subbank = num_act_mats_hor_dir * num_do_b_mat;
+          }
+  }
+
+  if ((g_ip->is_cache == false && is_main_mem == true) || (PAGE_MODE == 1 && is_dram))
+  {
+          if (num_act_mats_hor_dir * num_do_b_mat * Ndsam_lev_1 * Ndsam_lev_2 != (int)g_ip->page_sz_bits)
+          {
+                  return;
+          }
+  }
+
+//  if (is_tag == false && g_ip->is_cache == true && !fully_assoc && !pure_cam && //TODO: TODO burst transfer should also apply to RAM arrays
+  if (is_tag == false && g_ip->is_main_mem == true &&
+                  num_act_mats_hor_dir*num_do_b_mat*Ndsam_lev_1*Ndsam_lev_2 < ((int) g_ip->out_w * (int) g_ip->burst_len * (int) g_ip->data_assoc))
+  {
+          return;
+  }
+
+  if (num_act_mats_hor_dir > num_mats_h_dir)
+  {
+          return;
+  }
+
+
+  //compute di for mat subbank and bank
+  if (!(fully_assoc ||pure_cam))
+  {
+          if(!is_tag)
+          {
+                  if(g_ip->fast_access == true)
+                  {
+                          num_di_b_mat = num_do_b_mat / g_ip->data_assoc;
+                  }
+                  else
+                  {
+                          num_di_b_mat = num_do_b_mat;
+                  }
+          }
+          else
+          {
+                  num_di_b_mat = tagbits;
+          }
+  }
+  else
+  {
+          if (fully_assoc)
+          {
+                  num_di_b_mat = num_do_b_mat;
+                  //*num_subarrays/num_mats; bits per mat of CAM/FA is as same as cache,
+                  //but inside the mat wire tracks need to be reserved for search data bus
+                  num_si_b_mat = tagbits;
+          }
+          else
+          {
+                  num_di_b_mat = tagbits;
+                  num_si_b_mat = tagbits;//*num_subarrays/num_mats;
+          }
+
+  }
+
+  num_di_b_subbank       = num_di_b_mat * num_act_mats_hor_dir;//normal cache or normal r/w for FA
+  num_si_b_subbank       = num_si_b_mat; //* num_act_mats_hor_dir_sl; inside the data is broadcast
+
+  int num_addr_b_row_dec     = _log2(num_r_subarray);
+  if  ((fully_assoc ||pure_cam))
+          num_addr_b_row_dec     +=_log2(num_subarrays/num_mats);
+  int number_subbanks        = num_mats / num_act_mats_hor_dir;
+  number_subbanks_decode = _log2(number_subbanks);//TODO: add log2(num_subarray_per_bank) to FA/CAM
+
+  num_rw_ports = g_ip->num_rw_ports;
+  num_rd_ports = g_ip->num_rd_ports;
+  num_wr_ports = g_ip->num_wr_ports;
+  num_se_rd_ports = g_ip->num_se_rd_ports;
+  num_search_ports = g_ip->num_search_ports;
+
+  if (is_dram && is_main_mem)
+  {
+          number_addr_bits_mat = MAX((unsigned int) num_addr_b_row_dec,
+                          _log2(deg_bl_muxing) + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2));
+  }
+  else
+  {
+          number_addr_bits_mat = num_addr_b_row_dec + _log2(deg_bl_muxing) +
+          _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2);
+  }
+
+  if (!(fully_assoc ||pure_cam))
+  {
+          if (is_tag)
+          {
+                  num_di_b_bank_per_port = tagbits;
+                  num_do_b_bank_per_port = g_ip->data_assoc;
+          }
+          else
+          {
+                  num_di_b_bank_per_port = g_ip->out_w + g_ip->data_assoc;
+                  num_do_b_bank_per_port = g_ip->out_w;
+          }
+  }
+  else
+  {
+          if (fully_assoc)
+          {
+                  num_di_b_bank_per_port = g_ip->out_w + tagbits;//TODO: out_w or block_sz?
+                  num_si_b_bank_per_port = tagbits;
+                  num_do_b_bank_per_port = g_ip->out_w + tagbits;
+                  num_so_b_bank_per_port = g_ip->out_w;
+          }
+          else
+          {
+                  num_di_b_bank_per_port = tagbits;
+                  num_si_b_bank_per_port = tagbits;
+                  num_do_b_bank_per_port = tagbits;
+                  num_so_b_bank_per_port = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));
+          }
+  }
+
+  if ((!is_tag) && (g_ip->data_assoc > 1) && (!g_ip->fast_access))
+  {
+          number_way_select_signals_mat = g_ip->data_assoc;
+  }
+
+  // add ECC adjustment to all data signals that traverse on H-trees.
+  if (g_ip->add_ecc_b_ == true)
+  {
+          num_do_b_mat += (int) (ceil(num_do_b_mat / num_bits_per_ecc_b_));
+          num_di_b_mat += (int) (ceil(num_di_b_mat / num_bits_per_ecc_b_));
+          num_di_b_subbank += (int) (ceil(num_di_b_subbank / num_bits_per_ecc_b_));
+          num_do_b_subbank += (int) (ceil(num_do_b_subbank / num_bits_per_ecc_b_));
+          num_di_b_bank_per_port += (int) (ceil(num_di_b_bank_per_port / num_bits_per_ecc_b_));
+          num_do_b_bank_per_port += (int) (ceil(num_do_b_bank_per_port / num_bits_per_ecc_b_));
+
+          num_so_b_mat += (int) (ceil(num_so_b_mat / num_bits_per_ecc_b_));
+          num_si_b_mat += (int) (ceil(num_si_b_mat / num_bits_per_ecc_b_));
+          num_si_b_subbank += (int) (ceil(num_si_b_subbank / num_bits_per_ecc_b_));
+          num_so_b_subbank += (int) (ceil(num_so_b_subbank / num_bits_per_ecc_b_));
+          num_si_b_bank_per_port += (int) (ceil(num_si_b_bank_per_port / num_bits_per_ecc_b_));
+          num_so_b_bank_per_port += (int) (ceil(num_so_b_bank_per_port / num_bits_per_ecc_b_));
+  }
+
+  is_valid = true;
+}
+
diff --git a/ext/mcpat/cacti/parameter.h b/ext/mcpat/cacti/parameter.h
new file mode 100644 (file)
index 0000000..9c827bb
--- /dev/null
@@ -0,0 +1,367 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __PARAMETER_H__
+#define __PARAMETER_H__
+
+#include "area.h"
+#include "cacti_interface.h"
+#include "const.h"
+#include "io.h"
+
+// parameters which are functions of certain device technology
+class TechnologyParameter
+{
+ public:
+  class DeviceType
+  {
+   public:
+    double C_g_ideal;
+    double C_fringe;
+    double C_overlap;
+    double C_junc;  // C_junc_area
+    double C_junc_sidewall;
+    double l_phy;
+    double l_elec;
+    double R_nch_on;
+    double R_pch_on;
+    double Vdd;
+    double Vth;
+    double I_on_n;
+    double I_on_p;
+    double I_off_n;
+    double I_off_p;
+    double I_g_on_n;
+    double I_g_on_p;
+    double C_ox;
+    double t_ox;
+    double n_to_p_eff_curr_drv_ratio;
+    double long_channel_leakage_reduction;
+
+    DeviceType(): C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0),
+                  C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0),
+                  Vdd(0), Vth(0),
+                  I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),I_g_on_n(0),I_g_on_p(0),
+                  C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0), long_channel_leakage_reduction(0) { };
+    void reset()
+    {
+      C_g_ideal = 0;
+      C_fringe  = 0;
+      C_overlap = 0;
+      C_junc    = 0;
+      l_phy     = 0;
+      l_elec    = 0;
+      R_nch_on  = 0;
+      R_pch_on  = 0;
+      Vdd       = 0;
+      Vth       = 0;
+      I_on_n    = 0;
+      I_on_p    = 0;
+      I_off_n   = 0;
+      I_off_p   = 0;
+      I_g_on_n   = 0;
+      I_g_on_p   = 0;
+      C_ox      = 0;
+      t_ox      = 0;
+      n_to_p_eff_curr_drv_ratio = 0;
+      long_channel_leakage_reduction = 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+  class InterconnectType
+  {
+   public:
+    double pitch;
+    double R_per_um;
+    double C_per_um;
+    double horiz_dielectric_constant;
+    double vert_dielectric_constant;
+    double aspect_ratio;
+    double miller_value;
+    double ild_thickness;
+
+    InterconnectType(): pitch(0), R_per_um(0), C_per_um(0) { };
+
+    void reset()
+    {
+      pitch = 0;
+      R_per_um = 0;
+      C_per_um = 0;
+      horiz_dielectric_constant = 0;
+      vert_dielectric_constant = 0;
+      aspect_ratio = 0;
+      miller_value = 0;
+      ild_thickness = 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+  class MemoryType
+  {
+   public:
+    double b_w;
+    double b_h;
+    double cell_a_w;
+    double cell_pmos_w;
+    double cell_nmos_w;
+    double Vbitpre;
+
+    void reset()
+    {
+      b_w = 0;
+      b_h = 0;
+      cell_a_w = 0;
+      cell_pmos_w = 0;
+      cell_nmos_w = 0;
+      Vbitpre = 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+
+  class ScalingFactor
+  {
+   public:
+    double logic_scaling_co_eff;
+    double core_tx_density;
+    double long_channel_leakage_reduction;
+
+    ScalingFactor(): logic_scaling_co_eff(0), core_tx_density(0),
+    long_channel_leakage_reduction(0) { };
+
+    void reset()
+    {
+      logic_scaling_co_eff= 0;
+      core_tx_density = 0;
+      long_channel_leakage_reduction= 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+
+  double ram_wl_stitching_overhead_;
+  double min_w_nmos_;
+  double max_w_nmos_;
+  double max_w_nmos_dec;
+  double unit_len_wire_del;
+  double FO4;
+  double kinv;
+  double vpp;
+  double w_sense_en;
+  double w_sense_n;
+  double w_sense_p;
+  double sense_delay;
+  double sense_dy_power;
+  double w_iso;
+  double w_poly_contact;
+  double spacing_poly_to_poly;
+  double spacing_poly_to_contact;
+
+  double w_comp_inv_p1;
+  double w_comp_inv_p2;
+  double w_comp_inv_p3;
+  double w_comp_inv_n1;
+  double w_comp_inv_n2;
+  double w_comp_inv_n3;
+  double w_eval_inv_p;
+  double w_eval_inv_n;
+  double w_comp_n;
+  double w_comp_p;
+
+  double dram_cell_I_on;
+  double dram_cell_Vdd;
+  double dram_cell_I_off_worst_case_len_temp;
+  double dram_cell_C;
+  double gm_sense_amp_latch;
+
+  double w_nmos_b_mux;
+  double w_nmos_sa_mux;
+  double w_pmos_bl_precharge;
+  double w_pmos_bl_eq;
+  double MIN_GAP_BET_P_AND_N_DIFFS;
+  double MIN_GAP_BET_SAME_TYPE_DIFFS;
+  double HPOWERRAIL;
+  double cell_h_def;
+
+  double chip_layout_overhead;
+  double macro_layout_overhead;
+  double sckt_co_eff;
+
+  double fringe_cap;
+
+  uint64_t h_dec;
+
+  DeviceType sram_cell;   // SRAM cell transistor
+  DeviceType dram_acc;    // DRAM access transistor
+  DeviceType dram_wl;     // DRAM wordline transistor
+  DeviceType peri_global; // peripheral global
+  DeviceType cam_cell;   // SRAM cell transistor
+
+  InterconnectType wire_local;
+  InterconnectType wire_inside_mat;
+  InterconnectType wire_outside_mat;
+
+  ScalingFactor scaling_factor;
+
+  MemoryType sram;
+  MemoryType dram;
+  MemoryType cam;
+
+  void display(uint32_t indent = 0);
+
+  void reset()
+  {
+    dram_cell_Vdd  = 0;
+    dram_cell_I_on = 0;
+    dram_cell_C    = 0;
+    vpp            = 0;
+
+    sense_delay               = 0;
+    sense_dy_power            = 0;
+    fringe_cap                = 0;
+//    horiz_dielectric_constant = 0;
+//    vert_dielectric_constant  = 0;
+//    aspect_ratio              = 0;
+//    miller_value              = 0;
+//    ild_thickness             = 0;
+
+    dram_cell_I_off_worst_case_len_temp = 0;
+
+    sram_cell.reset();
+    dram_acc.reset();
+    dram_wl.reset();
+    peri_global.reset();
+    cam_cell.reset();
+
+    scaling_factor.reset();
+
+    wire_local.reset();
+    wire_inside_mat.reset();
+    wire_outside_mat.reset();
+
+    sram.reset();
+    dram.reset();
+    cam.reset();
+
+    chip_layout_overhead  = 0;
+    macro_layout_overhead = 0;
+    sckt_co_eff           = 0;
+  }
+};
+
+
+
+// Per-array organization parameters: the partitioning choice (Ndwl, Ndbl,
+// Nspd, mux degrees) together with the mat/subbank/bank signal counts that
+// are derived from it. Only the declaration is visible here; the
+// constructors are defined elsewhere.
+class DynamicParameter
+{
+  public:
+    // Kind of array being described.
+    bool is_tag;
+    bool pure_ram;
+    bool pure_cam;
+    bool fully_assoc;
+    int tagbits;
+    int num_subarrays;  // only for leakage computation  -- the number of subarrays per bank
+    int num_mats;       // only for leakage computation  -- the number of mats per bank
+    // Partitioning and multiplexing degrees.
+    double Nspd;
+    int Ndwl;
+    int Ndbl;
+    int Ndcm;
+    int deg_bl_muxing;
+    int deg_senseamp_muxing_non_associativity;
+    int Ndsam_lev_1;
+    int Ndsam_lev_2;
+    int number_addr_bits_mat;             // per port
+    int number_subbanks_decode;           // per_port
+    // Data-in (di) / data-out (do) bit counts at bank, mat and subbank level.
+    int num_di_b_bank_per_port;
+    int num_do_b_bank_per_port;
+    int num_di_b_mat;
+    int num_do_b_mat;
+    int num_di_b_subbank;
+    int num_do_b_subbank;
+
+    // Search-in (si) / search-out (so) bit counts, the counterparts of the
+    // di/do fields above for search operations.
+    int num_si_b_mat;
+    int num_so_b_mat;
+    int num_si_b_subbank;
+    int num_so_b_subbank;
+        int num_si_b_bank_per_port;
+        int num_so_b_bank_per_port;
+
+    int number_way_select_signals_mat;
+    int num_act_mats_hor_dir;
+
+    int num_act_mats_hor_dir_sl;
+    bool is_dram;
+    double V_b_sense;
+    unsigned int num_r_subarray;
+    unsigned int num_c_subarray;
+    int tag_num_r_subarray;//sheng: fully associative cache tag and data must be computed together, data and tag must be separate
+    int tag_num_c_subarray;
+    int data_num_r_subarray;
+    int data_num_c_subarray;
+    int num_mats_h_dir;
+    int num_mats_v_dir;
+    uint32_t ram_cell_tech_type;
+    double dram_refresh_period;
+
+    DynamicParameter();
+    // Build the parameters for one candidate organization.
+    // NOTE(review): presumably this leaves is_valid false when the
+    // organization is rejected -- confirm against the definition.
+    DynamicParameter(
+        bool         is_tag_,
+        int          pure_ram_,
+        int          pure_cam_,
+        double       Nspd_,
+        unsigned int Ndwl_,
+        unsigned int Ndbl_,
+        unsigned int Ndcm_,
+        unsigned int Ndsam_lev_1_,
+        unsigned int Ndsam_lev_2_,
+        bool         is_main_mem_);
+
+    int use_inp_params;
+    // Port counts.
+    unsigned int num_rw_ports;
+    unsigned int num_rd_ports;
+    unsigned int num_wr_ports;
+    unsigned int num_se_rd_ports;  // number of single ended read ports
+    unsigned int num_search_ports;
+    unsigned int out_w;// == nr_bits_out
+    bool   is_main_mem;
+    Area   cell, cam_cell;//cell is the sram_cell in both normal cache/ram and FA.
+    bool   is_valid;
+};
+
+
+
+extern InputParameter * g_ip;
+extern TechnologyParameter g_tp;
+
+#endif
+
diff --git a/ext/mcpat/cacti/router.cc b/ext/mcpat/cacti/router.cc
new file mode 100644 (file)
index 0000000..06f1706
--- /dev/null
@@ -0,0 +1,311 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include "router.h"
+
+Router::Router(
+    double flit_size_,
+    double vc_buf, /* vc size = vc_buffer_size * flit_size */
+    double vc_c,
+    TechnologyParameter::DeviceType *dt,
+    double I_,
+    double O_,
+    double M_
+    ):flit_size(flit_size_),
+      deviceType(dt),
+      I(I_),
+      O(O_),
+      M(M_)
+{
+  vc_buffer_size = vc_buf;
+  vc_count = vc_c;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+  double technology = g_ip->F_sz_um;
+
+  Vdd = dt->Vdd;
+
+  /*Crossbar parameters. Transmisson gate is employed for connector*/
+  NTtr = 10*technology*1e-6/2; /*Transmission gate's nmos tr. length*/
+  PTtr = 20*technology*1e-6/2; /* pmos tr. length*/
+  wt = 15*technology*1e-6/2; /*track width*/
+  ht = 15*technology*1e-6/2; /*track height*/
+//  I = 5; /*Number of crossbar input ports*/
+//  O = 5; /*Number of crossbar output ports*/
+  NTi = 12.5*technology*1e-6/2;
+  PTi = 25*technology*1e-6/2;
+
+  NTid = 60*technology*1e-6/2; //m
+  PTid = 120*technology*1e-6/2; // m
+  NTod = 60*technology*1e-6/2; // m
+  PTod = 120*technology*1e-6/2; // m
+
+  calc_router_parameters();
+}
+
+// Nothing to release: the destructor body is intentionally empty.
+Router::~Router()
+{
+}
+
+
+// Wire capacitance with triple spacing for a wire of the given length.
+double
+Router::Cw3(double length)
+{
+  Wire spaced_wire(g_ip->wt, length, 1, 3, 3);
+  const double cap = spaced_wire.wire_cap(length);
+  return cap;
+}
+
+/* Gate capacitance of a transistor of width w; the width is scaled by 1e6
+ * before being handed to gate_C(). */
+double
+Router::gate_cap(double w)
+{
+  const double width_u = w*1e6; /*u*/
+  return static_cast<double>(gate_C(width_u, 0));
+}
+
+/*
+ * Diffusion (drain) capacitance of a transistor of width w.
+ *   type  0 for n-mos and 1 for p-mos
+ *   s     number of stacking transistors (truncated to int)
+ * The width is scaled by 1e6 before being handed to drain_C_().
+ */
+double
+Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/,
+    double s /*number of stacking transistors*/)
+{
+  const double width_u = w*1e6; /*u*/
+  const int stack = (int) s;
+  return static_cast<double>(drain_C_(width_u, type, stack, 1, g_tp.cell_h_def));
+}
+
+
+/*crossbar related functions */
+
+// Simple transmission-gate model: input capacitance is the sum of the nmos
+// and pmos diffusion capacitances.
+double
+Router::transmission_buf_inpcap()
+{
+  const double nmos_diff = diff_cap(NTtr, 0, 1);
+  const double pmos_diff = diff_cap(PTtr, 1, 1);
+  return nmos_diff + pmos_diff;
+}
+
+// Output capacitance of the transmission gate: nmos plus pmos diffusion
+// capacitance (same expression as the input side).
+double
+Router::transmission_buf_outcap()
+{
+  const double nmos_diff = diff_cap(NTtr, 0, 1);
+  const double pmos_diff = diff_cap(PTtr, 1, 1);
+  return nmos_diff + pmos_diff;
+}
+
+// Control-terminal capacitance of the transmission gate: nmos plus pmos
+// gate capacitance.
+double
+Router::transmission_buf_ctrcap()
+{
+  const double nmos_gate = gate_cap(NTtr);
+  const double pmos_gate = gate_cap(PTtr);
+  return nmos_gate + pmos_gate;
+}
+
+// Capacitance of one crossbar input line: wire, the O transmission-gate
+// inputs hanging off it, and the NTid/PTid gate and diffusion terms.
+double
+Router::crossbar_inpline()
+{
+  // Terms are accumulated in the original left-to-right order.
+  double cap = Cw3(O*flit_size*wt);
+  cap = cap + O*transmission_buf_inpcap();
+  cap = cap + gate_cap(NTid);
+  cap = cap + gate_cap(PTid);
+  cap = cap + diff_cap(NTid, 0, 1);
+  cap = cap + diff_cap(PTid, 1, 1);
+  return cap;
+}
+
+// Capacitance of one crossbar output line: wire, the I transmission-gate
+// outputs hanging off it, and the NTod/PTod gate and diffusion terms.
+double
+Router::crossbar_outline()
+{
+  // Terms are accumulated in the original left-to-right order.
+  double cap = Cw3(I*flit_size*ht);
+  cap = cap + I*transmission_buf_outcap();
+  cap = cap + gate_cap(NTod);
+  cap = cap + gate_cap(PTod);
+  cap = cap + diff_cap(NTod, 0, 1);
+  cap = cap + diff_cap(PTod, 1, 1);
+  return cap;
+}
+
+// Capacitance of one crossbar control line: wire, flit_size transmission-gate
+// control terminals, and the NTi/PTi diffusion and gate terms.
+double
+Router::crossbar_ctrline()
+{
+  // Terms are accumulated in the original left-to-right order.
+  double cap = Cw3(0.5*O*flit_size*wt);
+  cap = cap + flit_size*transmission_buf_ctrcap();
+  cap = cap + diff_cap(NTi, 0, 1);
+  cap = cap + diff_cap(PTi, 1, 1);
+  cap = cap + gate_cap(NTi);
+  cap = cap + gate_cap(PTi);
+  return cap;
+}
+
+// Crossbar estimate built from the input-line and output-line capacitances:
+// each contributes C*Vdd*Vdd*flit_size/2, and the sum is doubled.
+double
+Router::tr_crossbar_power()
+{
+  const double input_part  = crossbar_inpline()*Vdd*Vdd*flit_size/2;
+  const double output_part = crossbar_outline()*Vdd*Vdd*flit_size/2;
+  return (input_part + output_part)*2;
+}
+
+void Router::buffer_stats()
+{
+  DynamicParameter dyn_p;
+  dyn_p.is_tag      = false;
+  dyn_p.pure_cam    = false;
+  dyn_p.fully_assoc = false;
+  dyn_p.pure_ram    = true;
+  dyn_p.is_dram     = false;
+  dyn_p.is_main_mem = false;
+  dyn_p.num_subarrays = 1;
+  dyn_p.num_mats = 1;
+  dyn_p.Ndbl = 1;
+  dyn_p.Ndwl = 1;
+  dyn_p.Nspd = 1;
+  dyn_p.deg_bl_muxing = 1;
+  dyn_p.deg_senseamp_muxing_non_associativity = 1;
+  dyn_p.Ndsam_lev_1 = 1;
+  dyn_p.Ndsam_lev_2 = 1;
+  dyn_p.Ndcm = 1;
+  dyn_p.number_addr_bits_mat = 8;
+  dyn_p.number_way_select_signals_mat = 1;
+  dyn_p.number_subbanks_decode = 0;
+  dyn_p.num_act_mats_hor_dir = 1;
+  dyn_p.V_b_sense = Vdd; // FIXME check power calc.
+  dyn_p.ram_cell_tech_type = 0;
+  dyn_p.num_r_subarray = (int) vc_buffer_size;
+  dyn_p.num_c_subarray = (int) flit_size * (int) vc_count;
+  dyn_p.num_mats_h_dir = 1;
+  dyn_p.num_mats_v_dir = 1;
+  dyn_p.num_do_b_subbank = (int)flit_size;
+  dyn_p.num_di_b_subbank = (int)flit_size;
+  dyn_p.num_do_b_mat = (int) flit_size;
+  dyn_p.num_di_b_mat = (int) flit_size;
+  dyn_p.num_do_b_mat = (int) flit_size;
+  dyn_p.num_di_b_mat = (int) flit_size;
+  dyn_p.num_do_b_bank_per_port = (int) flit_size;
+  dyn_p.num_di_b_bank_per_port = (int) flit_size;
+  dyn_p.out_w = (int) flit_size;
+
+  dyn_p.use_inp_params = 1;
+  dyn_p.num_wr_ports = (unsigned int) vc_count;
+  dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book
+  dyn_p.num_rw_ports = 0;
+  dyn_p.num_se_rd_ports =0;
+  dyn_p.num_search_ports =0;
+
+
+
+  dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports +
+      dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
+  dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 +
+      (dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) +
+      dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
+
+  Mat buff(dyn_p);
+  buff.compute_delays(0);
+  buff.compute_power_energy();
+  buffer.power.readOp  = buff.power.readOp;
+  buffer.power.writeOp = buffer.power.readOp; //FIXME
+  buffer.area = buff.area;
+}
+
+
+
+  void
+Router::cb_stats ()
+{
+  if (1) {
+    Crossbar c_b(I, O, flit_size);
+    c_b.compute_power();
+    crossbar.delay = c_b.delay;
+    crossbar.power.readOp.dynamic = c_b.power.readOp.dynamic;
+    crossbar.power.readOp.leakage = c_b.power.readOp.leakage;
+    crossbar.power.readOp.gate_leakage = c_b.power.readOp.gate_leakage;
+    crossbar.area = c_b.area;
+//  c_b.print_crossbar();
+  }
+  else {
+    crossbar.power.readOp.dynamic = tr_crossbar_power();
+    crossbar.power.readOp.leakage = flit_size * I * O *
+        cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
+    crossbar.power.readOp.gate_leakage = flit_size * I * O *
+        cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
+  }
+}
+
+void
+Router::get_router_power()
+{
+  /* calculate buffer stats */
+  buffer_stats();
+
+  /* calculate cross-bar stats */
+  cb_stats();
+
+  /* calculate arbiter stats */
+  Arbiter vcarb(vc_count, flit_size, buffer.area.w);
+  Arbiter cbarb(I, flit_size, crossbar.area.w);
+  vcarb.compute_power();
+  cbarb.compute_power();
+  arbiter.power.readOp.dynamic = vcarb.power.readOp.dynamic * I +
+    cbarb.power.readOp.dynamic * O;
+  arbiter.power.readOp.leakage = vcarb.power.readOp.leakage * I +
+    cbarb.power.readOp.leakage * O;
+  arbiter.power.readOp.gate_leakage = vcarb.power.readOp.gate_leakage * I +
+    cbarb.power.readOp.gate_leakage * O;
+
+//  arb_stats();
+  power.readOp.dynamic = ((buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) +
+                  crossbar.power.readOp.dynamic +
+                  arbiter.power.readOp.dynamic)*MIN(I, O)*M;
+  double pppm_t[4]    = {1,I,I,1};
+  power = power + (buffer.power*pppm_t + crossbar.power + arbiter.power)*pppm_lkg;
+
+}
+
+  // Set the router frequency, cycle time and pipeline depth. The frequency
+  // starts at 5 and is lowered to 1/max_cyc when the resulting cycle time is
+  // shorter than max_cyc (17 FO4 delays).
+  // NOTE(review): cycle_time is not recomputed after FREQUENCY is lowered --
+  // confirm whether callers rely on that.
+  void
+Router::get_router_delay ()
+{
+  FREQUENCY = 5; // move this to config file --TODO
+  delay = 4;
+
+  cycle_time = (1/(double)FREQUENCY)*1e3; //ps
+
+  max_cyc = 17 * g_tp.FO4; //s
+  max_cyc *= 1e12; //ps
+
+  const bool cycle_too_short = cycle_time < max_cyc;
+  if (cycle_too_short) {
+    FREQUENCY = (1/max_cyc)*1e3; //GHz
+  }
+}
+
+  // Router footprint: height is I times the buffer height, width is the
+  // buffer width plus the crossbar width.
+  void
+Router::get_router_area()
+{
+  area.w = buffer.area.w+crossbar.area.w;
+  area.h = I*buffer.area.h;
+}
+
+  // Run the three model stages in order: delay (frequency and pipeline
+  // cycles), then power, then area. Area reads the buffer and crossbar
+  // results that the power stage fills in.
+  void
+Router::calc_router_parameters()
+{
+  get_router_delay();  /* router frequency and pipeline cycles */
+  get_router_power();  /* router power stats */
+  get_router_area();   /* area stats */
+}
+
+  void
+Router::print_router()
+{
+  cout << "\n\nRouter stats:\n";
+  cout << "\tRouter Area - "<< area.get_area()*1e-6<<"(mm^2)\n";
+  cout << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n";
+  cout << "\tNetwork frequency - " << FREQUENCY <<" GHz\n";
+  cout << "\tNo. of Virtual channels - " << vc_count << "\n";
+  cout << "\tNo. of pipeline stages - " << delay << endl;
+  cout << "\tLink bandwidth - " << flit_size << " (bits)\n";
+  cout << "\tNo. of buffer entries per virtual channel -  "<< vc_buffer_size << "\n";
+  cout << "\tSimple buffer Area - "<< buffer.area.get_area()*1e-6<<"(mm^2)\n";
+  cout << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 <<" (nJ)\n";
+  cout << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 <<" (mW)\n";
+  cout << "\tCrossbar Area - "<< crossbar.area.get_area()*1e-6<<"(mm^2)\n";
+  cout << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9<<" (nJ)\n";
+  cout << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3<<" (mW)\n";
+  cout << "\tArbiter access energy (VC arb + Crossbar arb) - "<<arbiter.power.readOp.dynamic * 1e9 <<" (nJ)\n";
+  cout << "\tArbiter leakage (VC arb + Crossbar arb) - "<<arbiter.power.readOp.leakage * 1e3 <<" (mW)\n";
+
+}
+
diff --git a/ext/mcpat/cacti/router.h b/ext/mcpat/cacti/router.h
new file mode 100644 (file)
index 0000000..72ef449
--- /dev/null
@@ -0,0 +1,115 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __ROUTER_H__
+#define __ROUTER_H__
+
+#include <assert.h>
+
+#include <iostream>
+
+#include "arbiter.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "crossbar.h"
+#include "mat.h"
+#include "parameter.h"
+#include "wire.h"
+
+/*
+ * Router model built on top of Component. It aggregates three
+ * sub-components (arbiter, crossbar, buffer) whose area, read energy and
+ * leakage are reported by print_router().
+ */
+class Router : public Component
+{
+  public:
+    /*
+     * dt defaults to the global peripheral device type; I_, O_ and M_
+     * default to 5, 5 and 0.6 respectively.
+     * NOTE(review): I_/O_ look like input/output port counts and M_ is
+     * stored as the "network load" (see M below) -- confirm in router.cc.
+     */
+    Router(
+        double flit_size_,
+        double vc_buf, /* vc size = vc_buffer_size * flit_size */
+        double vc_count,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global),
+        double I_ = 5,
+        double O_ = 5,
+        double M_ = 0.6);
+    ~Router();
+
+
+    /* Print the router statistics to cout. */
+    void print_router();
+
+    /* Sub-component results; each carries its own area and power. */
+    Component arbiter, crossbar, buffer;
+
+    /* Both are handled in picoseconds by the delay code in router.cc. */
+    double cycle_time, max_cyc;
+    /* Reported by print_router() as the link bandwidth in bits. */
+    double flit_size;
+    /* Reported by print_router() as the number of virtual channels. */
+    double vc_count;
+    double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */
+
+  private:
+        TechnologyParameter::DeviceType *deviceType;
+        /* Reported by print_router() in GHz. */
+        double FREQUENCY; // move this to config file --TODO
+    double Cw3(double len);
+    double gate_cap(double w);
+    double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack);
+    enum Wire_type wtype;
+    enum Wire_placement wire_placement;
+    //crossbar
+    double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2;
+    double M; //network load
+    double transmission_buf_inpcap();
+    double transmission_buf_outcap();
+    double transmission_buf_ctrcap();
+    double crossbar_inpline();
+    double crossbar_outline();
+    double crossbar_ctrline();
+    double tr_crossbar_power();
+    void  cb_stats ();
+    double arb_power();
+    void  arb_stats ();
+    double buffer_params();
+    void buffer_stats();
+
+
+    //arbiter
+
+    //buffer
+
+    //router params
+    double Vdd;
+
+    /*
+     * calc_router_parameters() is the driver: it calls get_router_delay(),
+     * get_router_power() and get_router_area(), in that order.
+     */
+    void calc_router_parameters();
+    void get_router_area();
+    void get_router_power();
+    void get_router_delay();
+
+    double min_w_pmos;
+
+
+};
+
+#endif
diff --git a/ext/mcpat/cacti/subarray.cc b/ext/mcpat/cacti/subarray.cc
new file mode 100755 (executable)
index 0000000..7cbf7d9
--- /dev/null
@@ -0,0 +1,196 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+
+#include "subarray.h"
+
+// Build a subarray from the dynamic parameters: copy the row/column counts
+// and cell geometry out of dp_, add ECC columns when g_ip->add_ecc_b_ is
+// set, compute the subarray height and width (area.h / area.w), and finally
+// call compute_C() to fill in the wordline/bitline parasitics.
+//
+// Three layouts are handled, selected by is_fa_ and dp.pure_cam:
+//   - neither set : plain RAM array
+//   - is_fa_      : fully associative (CAM columns plus RAM columns)
+//   - pure_cam    : CAM columns only
+//
+// The initializers read through the member dp rather than the parameter
+// dp_; this relies on dp being declared before the other members in the
+// class so that it is initialized first.
+Subarray::Subarray(const DynamicParameter & dp_, bool is_fa_):
+  dp(dp_), num_rows(dp.num_r_subarray), num_cols(dp.num_c_subarray),
+  num_cols_fa_cam(dp.tag_num_c_subarray), num_cols_fa_ram(dp.data_num_c_subarray),
+  cell(dp.cell), cam_cell(dp.cam_cell), is_fa(is_fa_)
+{
+        //num_cols=7;
+        //cout<<"num_cols ="<< num_cols <<endl;
+  if (!(is_fa || dp.pure_cam))
+  {
+          // NOTE(review): whether this division is integer or floating point
+          // depends on the type of num_bits_per_ecc_b_, which is declared
+          // elsewhere -- confirm that ceil() sees a fractional value.
+          num_cols +=(g_ip->add_ecc_b_ ? (int)ceil(num_cols / num_bits_per_ecc_b_) : 0);   // ECC overhead
+          // Cells per wordline-stitch group, chosen by RAM cell technology.
+          uint32_t ram_num_cells_wl_stitching =
+                  (dp.ram_cell_tech_type == lp_dram)   ? dram_num_cells_wl_stitching_ :
+          (dp.ram_cell_tech_type == comm_dram) ? comm_dram_num_cells_wl_stitching_ : sram_num_cells_wl_stitching_;
+
+          area.h = cell.h * num_rows;
+
+          // NOTE(review): num_cols (unsigned int) / ram_num_cells_wl_stitching
+          // (uint32_t) is an integer division, so the quotient is already
+          // truncated and ceil() has no effect here. Confirm whether
+          // rounding down is the intended stitch count before changing it.
+          area.w = cell.w * num_cols +
+          ceil(num_cols / ram_num_cells_wl_stitching) * g_tp.ram_wl_stitching_overhead_;  // stitching overhead
+  }
+  else  //cam fa
+  {
+
+          //should not add dummy row here since the dummy row do not need decoder
+          if (is_fa)// fully associative cache
+          {
+                  // ECC columns are added to the CAM and RAM parts separately.
+                  num_cols_fa_cam  += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0;
+                  num_cols_fa_ram  += (g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_ram / num_bits_per_ecc_b_) : 0);
+                  num_cols = num_cols_fa_cam + num_cols_fa_ram;
+          }
+          else
+          {
+                  // Pure CAM: there is no RAM part, so its column count is zeroed.
+                  num_cols_fa_cam  += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0;
+                  num_cols_fa_ram  = 0;
+                  num_cols = num_cols_fa_cam;
+          }
+
+          area.h = cam_cell.h * (num_rows + 1);//height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
+          // NOTE(review): if sram_num_cells_wl_stitching_ is an integer type,
+          // the division below truncates before ceil() as in the RAM branch
+          // -- confirm against its declaration.
+          area.w = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram
+          + ceil((num_cols_fa_cam + num_cols_fa_ram) / sram_num_cells_wl_stitching_)*g_tp.ram_wl_stitching_overhead_
+          + 16*g_tp.wire_local.pitch //the overhead for the NAND gate to connect the two halves
+          + 128*g_tp.wire_local.pitch;//the overhead for the drivers from matchline to wordline of RAM
+  }
+
+  // A non-positive dimension means the inputs were inconsistent.
+  assert(area.h>0);
+  assert(area.w>0);
+  compute_C();
+}
+
+
+
+// Empty destructor: the body performs no cleanup of its own.
+Subarray::~Subarray() {}
+
+
+
+// Return the total area occupied by the storage cells of this subarray.
+//   - plain RAM         : cell area * rows * columns
+//   - fully associative : CAM cell height * (rows + 1) * combined width of
+//                         the CAM and RAM columns
+//   - pure CAM          : CAM cell area * (rows + 1) * CAM columns
+double Subarray::get_total_cell_area()
+{
+    const bool plain_ram = !is_fa && !dp.pure_cam;
+    if (plain_ram)
+    {
+      return cell.get_area() * num_rows * num_cols;
+    }
+
+    if (!is_fa)
+    {
+      // Pure CAM: only the CAM columns contribute.
+      return cam_cell.get_area() * (num_rows + 1) * num_cols_fa_cam;
+    }
+
+    //for FA, this area includes the dummy cells in SRAM arrays.
+    const double combined_width = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram;
+    return cam_cell.h * (num_rows + 1) * combined_width;
+}
+
+
+
+// Compute the wordline and bitline parasitics of the subarray and store
+// them in the C_wl / C_bl members (plus the *_cam / *_ram variants and the
+// R_wl members for CAM and fully associative arrays).
+//
+// Per-cell wire contributions are the cell width (wordline) or cell height
+// (bitline) multiplied by the local-wire C or R per micron.
+//
+// NOTE(review): R_wl, R_wl_cam and R_wl_ram are only assigned in the
+// CAM/FA branch; the DRAM and plain SRAM branches set C_wl and C_bl only,
+// and r_w_metal goes unused there. Confirm no caller reads R_wl for those
+// array types.
+void Subarray::compute_C()
+{
+  double c_w_metal = cell.w * g_tp.wire_local.C_per_um;
+  double r_w_metal = cell.w * g_tp.wire_local.R_per_um;
+  double C_b_metal = cell.h * g_tp.wire_local.C_per_um;
+  double C_b_row_drain_C;
+
+  if (dp.is_dram)
+  {
+    // Wordline: pass-gate capacitance plus wire capacitance, per column.
+    C_wl = (gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) + c_w_metal) * num_cols;
+
+    if (dp.ram_cell_tech_type == comm_dram)
+    {
+      // Commodity DRAM: only the metal contribution is counted per row.
+      C_bl = num_rows * C_b_metal;
+    }
+    else
+    {
+      C_b_row_drain_C = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;  // due to shared contact
+      C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
+    }
+  }
+  else
+  {
+          if (!(is_fa ||dp.pure_cam))
+          {
+                  // Plain SRAM: the gate_C_pass term is doubled for each column.
+                  C_wl = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
+                                  c_w_metal) * num_cols;
+                  C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
+                  C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
+          }
+          else
+          {
+                 //Following is wordline not matchline
+                 //CAM portion
+                 // The per-cell wire values are recomputed with the CAM cell width.
+                 c_w_metal = cam_cell.w * g_tp.wire_local.C_per_um;
+                 r_w_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+         C_wl_cam = (gate_C_pass(g_tp.cam.cell_a_w, (g_tp.cam.b_w-2*g_tp.cam.cell_a_w)/2.0, false, true)*2 +
+                                  c_w_metal) * num_cols_fa_cam;
+         R_wl_cam = (r_w_metal) * num_cols_fa_cam;
+
+         if (!dp.pure_cam)
+         {
+                 //RAM portion
+                 // Back to the RAM cell width for the RAM columns.
+                 c_w_metal = cell.w * g_tp.wire_local.C_per_um;
+                 r_w_metal = cell.w * g_tp.wire_local.R_per_um;
+                 C_wl_ram = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
+                                 c_w_metal) * num_cols_fa_ram;
+                 R_wl_ram = (r_w_metal) * num_cols_fa_ram;
+         }
+         else
+         {
+                 // Pure CAM has no RAM columns.
+                 C_wl_ram = R_wl_ram =0;
+         }
+         // Total wordline = CAM part + RAM part + (16+128) wire pitches of
+         // extra local wire, the same 16 and 128 pitch overheads that the
+         // constructor adds to area.w.
+         C_wl = C_wl_cam + C_wl_ram;
+         C_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.C_per_um;
+
+         R_wl = R_wl_cam + R_wl_ram;
+         R_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.R_per_um;
+
+         //there are two ways to write to a FA,
+         //1) Write to CAM array then force a match on match line to active the corresponding wordline in RAM;
+         //2) using separate wordline for read/write and search in RAM.
+         //We are using the second approach.
+
+         //Bitline CAM portion This is bitline not searchline. We assume no sharing between bitline and searchline according to SUN's implementations.
+         C_b_metal = cam_cell.h * g_tp.wire_local.C_per_um;
+         C_b_row_drain_C = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;  // due to shared contact
+         C_bl_cam = (num_rows+1) * (C_b_row_drain_C + C_b_metal);
+         //height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
+         // C_bl keeps the CAM-height metal term (C_b_metal above) but uses
+         // the SRAM cell's drain capacitance.
+         C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
+         C_bl = (num_rows +1) * (C_b_row_drain_C + C_b_metal);
+
+          }
+  }
+}
+
+
diff --git a/ext/mcpat/cacti/subarray.h b/ext/mcpat/cacti/subarray.h
new file mode 100755 (executable)
index 0000000..5fb0624
--- /dev/null
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __SUBARRAY_H__
+#define __SUBARRAY_H__
+
+#include "area.h"
+#include "component.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+// Model of a single memory subarray (plain RAM, fully associative, or pure
+// CAM). The constructor computes the subarray dimensions and then the
+// wordline/bitline parasitics via compute_C().
+class Subarray : public Component
+{
+  public:
+    Subarray(const DynamicParameter & dp, bool is_fa_);
+    ~Subarray();
+
+    // Held by reference: the DynamicParameter passed to the constructor
+    // must outlive this object. Declared first so the constructor's other
+    // initializers can read through it.
+    const DynamicParameter & dp;
+    // Total area of the storage cells for the current array type.
+    double  get_total_cell_area();
+    unsigned int num_rows;
+    // Total column count; the constructor adds ECC columns when enabled.
+    unsigned int num_cols;
+    // Column counts of the CAM and RAM portions (CAM / fully associative).
+    int32_t num_cols_fa_cam;
+    int32_t num_cols_fa_ram;
+    // Geometry of one RAM cell and one CAM cell.
+    Area    cell, cam_cell;
+
+    // True for a fully associative array.
+    bool    is_fa;
+    // Wordline capacitance: total, CAM part, RAM part.
+    double  C_wl, C_wl_cam, C_wl_ram;
+    // Wordline resistance: total, CAM part, RAM part. Only written by
+    // compute_C() for CAM / fully associative arrays.
+    double  R_wl, R_wl_cam, R_wl_ram;
+    // Bitline capacitance; C_bl_cam is only written for CAM / FA arrays.
+    double  C_bl, C_bl_cam;
+  private:
+
+    void compute_C();  // compute bitline and wordline capacitance
+};
+
+
+
+#endif
+
diff --git a/ext/mcpat/cacti/technology.cc b/ext/mcpat/cacti/technology.cc
new file mode 100644 (file)
index 0000000..a40c6eb
--- /dev/null
@@ -0,0 +1,2921 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#include "basic_circuit.h"
+
+#include "parameter.h"
+
+// Wire resistance from resistivity and the conducting cross-section.
+// The barrier thickness is subtracted once from the wire thickness and
+// twice from the wire width; the dishing thickness is subtracted from the
+// wire thickness. The result is scaled by alpha_scatter. No check is made
+// for a zero or negative cross-section.
+double wire_resistance(double resistivity, double wire_width, double wire_thickness,
+    double barrier_thickness, double dishing_thickness, double alpha_scatter)
+{
+  const double conducting_height = wire_thickness - barrier_thickness - dishing_thickness;
+  const double conducting_width = wire_width - 2 * barrier_thickness;
+  return alpha_scatter * resistivity / (conducting_height * conducting_width);
+}
+
+// Wire capacitance as the sum of three terms:
+//   - vertical : 2 * eps0 * vert_dielectric_constant * wire_width / ild_thickness
+//   - sidewall : 2 * eps0 * miller_value * horiz_dielectric_constant *
+//                wire_thickness / wire_spacing
+//   - fringe   : fringe_cap, passed in by the caller
+// where eps0 is PERMITTIVITY_FREE_SPACE.
+double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing,
+    double ild_thickness, double miller_value, double horiz_dielectric_constant,
+    double vert_dielectric_constant, double fringe_cap)
+{
+  const double vertical_cap =
+      2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness;
+  const double sidewall_cap =
+      2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing;
+  return vertical_cap + sidewall_cap + fringe_cap;
+}
+
+
+void init_tech_params(double technology, bool is_tag)
+{
+  int    iter, tech, tech_lo, tech_hi;
+  double curr_alpha, curr_vpp;
+  double wire_width, wire_thickness, wire_spacing,
+         fringe_cap, pmos_to_nmos_sizing_r;
+//  double aspect_ratio,ild_thickness, miller_value = 1.5, horiz_dielectric_constant, vert_dielectric_constant;
+  double barrier_thickness, dishing_thickness, alpha_scatter;
+  double curr_vdd_dram_cell, curr_v_th_dram_access_transistor, curr_I_on_dram_cell, curr_c_dram_cell;
+
+  uint32_t ram_cell_tech_type    = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
+  uint32_t peri_global_tech_type = (is_tag) ? g_ip->tag_arr_peri_global_tech_type : g_ip->data_arr_peri_global_tech_type;
+
+  technology  = technology * 1000.0;  // in the unit of nm
+
+  // initialize parameters
+  g_tp.reset();
+  double gmp_to_gmn_multiplier_periph_global = 0;
+
+  double curr_Wmemcella_dram, curr_Wmemcellpmos_dram, curr_Wmemcellnmos_dram,
+         curr_area_cell_dram, curr_asp_ratio_cell_dram, curr_Wmemcella_sram,
+         curr_Wmemcellpmos_sram, curr_Wmemcellnmos_sram, curr_area_cell_sram,
+         curr_asp_ratio_cell_sram, curr_I_off_dram_cell_worst_case_length_temp;
+  double curr_Wmemcella_cam, curr_Wmemcellpmos_cam, curr_Wmemcellnmos_cam, curr_area_cell_cam,//Sheng: CAM data
+         curr_asp_ratio_cell_cam;
+  double SENSE_AMP_D, SENSE_AMP_P; // J
+  double area_cell_dram = 0;
+  double asp_ratio_cell_dram = 0;
+  double area_cell_sram = 0;
+  double asp_ratio_cell_sram = 0;
+  double area_cell_cam = 0;
+  double asp_ratio_cell_cam = 0;
+  double mobility_eff_periph_global = 0;
+  double Vdsat_periph_global = 0;
+  double nmos_effective_resistance_multiplier;
+  double width_dram_access_transistor;
+
+  double curr_logic_scaling_co_eff = 0;//This is based on the reported numbers of Intel Merom 65nm, Penryn45nm and IBM cell 90/65/45 date
+  double curr_core_tx_density = 0;//this is density per um^2; 90, ...22nm based on Intel Penryn
+  double curr_chip_layout_overhead = 0;
+  double curr_macro_layout_overhead = 0;
+  double curr_sckt_co_eff = 0;
+
+  if (technology < 181 && technology > 179)
+      {
+        tech_lo = 180;
+        tech_hi = 180;
+      }
+  else if (technology < 91 && technology > 89)
+  {
+    tech_lo = 90;
+    tech_hi = 90;
+  }
+  else if (technology < 66 && technology > 64)
+  {
+    tech_lo = 65;
+    tech_hi = 65;
+  }
+  else if (technology < 46 && technology > 44)
+  {
+    tech_lo = 45;
+    tech_hi = 45;
+  }
+  else if (technology < 33 && technology > 31)
+  {
+    tech_lo = 32;
+    tech_hi = 32;
+  }
+  else if (technology < 23 && technology > 21)
+  {
+    tech_lo = 22;
+    tech_hi = 22;
+    if (ram_cell_tech_type == 3 )
+    {
+       cout<<"current version does not support eDRAM technologies at 22nm"<<endl;
+       exit(0);
+    }
+  }
+//  else if (technology < 17 && technology > 15)
+//  {
+//    tech_lo = 16;
+//    tech_hi = 16;
+//  }
+  else if (technology < 180 && technology > 90)
+    {
+      tech_lo = 180;
+      tech_hi = 90;
+    }
+  else if (technology < 90 && technology > 65)
+  {
+    tech_lo = 90;
+    tech_hi = 65;
+  }
+  else if (technology < 65 && technology > 45)
+  {
+    tech_lo = 65;
+    tech_hi = 45;
+  }
+  else if (technology < 45 && technology > 32)
+  {
+    tech_lo = 45;
+    tech_hi = 32;
+  }
+  else if (technology < 32 && technology > 22)
+    {
+      tech_lo = 32;
+      tech_hi = 22;
+    }
+//  else if (technology < 22 && technology > 16)
+//    {
+//      tech_lo = 22;
+//      tech_hi = 16;
+//    }
+      else
+    {
+          cout<<"Invalid technology nodes"<<endl;
+          exit(0);
+    }
+
+  double vdd[NUMBER_TECH_FLAVORS];
+  double Lphy[NUMBER_TECH_FLAVORS];
+  double Lelec[NUMBER_TECH_FLAVORS];
+  double t_ox[NUMBER_TECH_FLAVORS];
+  double v_th[NUMBER_TECH_FLAVORS];
+  double c_ox[NUMBER_TECH_FLAVORS];
+  double mobility_eff[NUMBER_TECH_FLAVORS];
+  double Vdsat[NUMBER_TECH_FLAVORS];
+  double c_g_ideal[NUMBER_TECH_FLAVORS];
+  double c_fringe[NUMBER_TECH_FLAVORS];
+  double c_junc[NUMBER_TECH_FLAVORS];
+  double I_on_n[NUMBER_TECH_FLAVORS];
+  double I_on_p[NUMBER_TECH_FLAVORS];
+  double Rnchannelon[NUMBER_TECH_FLAVORS];
+  double Rpchannelon[NUMBER_TECH_FLAVORS];
+  double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS];
+  double I_off_n[NUMBER_TECH_FLAVORS][101];
+  double I_g_on_n[NUMBER_TECH_FLAVORS][101];
+  //double I_off_p[NUMBER_TECH_FLAVORS][101];
+  double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS];
+  //double curr_sckt_co_eff[NUMBER_TECH_FLAVORS];
+  double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS];
+
+  for (iter = 0; iter <= 1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 180)
+    {
+      //180nm technology-node. Corresponds to year 1999 in ITRS
+      //Only HP transistor was of interest that 180nm since leakage power was not a big issue. Performance was the king
+      //MASTAR does not contain data for 0.18um process. The following parameters are projected based on ITRS 2000 update and IBM 0.18 Cu Spice input
+      bool Aggre_proj = false;
+      SENSE_AMP_D = .28e-9; // s
+      SENSE_AMP_P = 14.7e-15; // J
+      vdd[0]   = 1.5;
+      Lphy[0]  = 0.12;//Lphy is the physical gate-length. micron
+      Lelec[0] = 0.10;//Lelec is the electrical gate-length. micron
+      t_ox[0]  = 1.2e-3*(Aggre_proj? 1.9/1.2:2);//micron
+      v_th[0]  = Aggre_proj? 0.36 : 0.4407;//V
+      c_ox[0]  = 1.79e-14*(Aggre_proj? 1.9/1.2:2);//F/micron2
+      mobility_eff[0] = 302.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+      Vdsat[0] = 0.128*2; //V
+      c_g_ideal[0] = (Aggre_proj? 1.9/1.2:2)*6.64e-16;//F/micron
+      c_fringe[0]  = (Aggre_proj? 1.9/1.2:2)*0.08e-15;//F/micron
+      c_junc[0] = (Aggre_proj? 1.9/1.2:2)*1e-15;//F/micron2
+      I_on_n[0] = 750e-6;//A/micron
+      I_on_p[0] = 350e-6;//A/micron
+      //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0]  = 7e-10;//A/micron
+      I_off_n[0][10] = 8.26e-10;
+      I_off_n[0][20] = 9.74e-10;
+      I_off_n[0][30] = 1.15e-9;
+      I_off_n[0][40] = 1.35e-9;
+      I_off_n[0][50] = 1.60e-9;
+      I_off_n[0][60] = 1.88e-9;
+      I_off_n[0][70] = 2.29e-9;
+      I_off_n[0][80] = 2.70e-9;
+      I_off_n[0][90] = 3.19e-9;
+      I_off_n[0][100] = 3.76e-9;
+
+      I_g_on_n[0][0]  = 1.65e-10;//A/micron
+      I_g_on_n[0][10] = 1.65e-10;
+      I_g_on_n[0][20] = 1.65e-10;
+      I_g_on_n[0][30] = 1.65e-10;
+      I_g_on_n[0][40] = 1.65e-10;
+      I_g_on_n[0][50] = 1.65e-10;
+      I_g_on_n[0][60] = 1.65e-10;
+      I_g_on_n[0][70] = 1.65e-10;
+      I_g_on_n[0][80] = 1.65e-10;
+      I_g_on_n[0][90] = 1.65e-10;
+      I_g_on_n[0][100] = 1.65e-10;
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360
+      curr_asp_ratio_cell_cam = 2.92;//2.5
+      //Empirical undifferetiated core/FU coefficient
+      curr_logic_scaling_co_eff  = 1.5;//linear scaling from 90nm
+      curr_core_tx_density       = 1.25*0.7*0.7*0.4;
+      curr_sckt_co_eff           = 1.11;
+      curr_chip_layout_overhead  = 1.0;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.0;//EDA placement and routing tool rule of thumb
+
+    }
+
+    if (tech == 90)
+    {
+      SENSE_AMP_D = .28e-9; // s
+      SENSE_AMP_P = 14.7e-15; // J
+      //90nm technology-node. Corresponds to year 2004 in ITRS
+      //ITRS HP device type
+      vdd[0]   = 1.2;
+      Lphy[0]  = 0.037;//Lphy is the physical gate-length. micron
+      Lelec[0] = 0.0266;//Lelec is the electrical gate-length. micron
+      t_ox[0]  = 1.2e-3;//micron
+      v_th[0]  = 0.23707;//V
+      c_ox[0]  = 1.79e-14;//F/micron2
+      mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+      Vdsat[0] = 0.128; //V
+      c_g_ideal[0] = 6.64e-16;//F/micron
+      c_fringe[0]  = 0.08e-15;//F/micron
+      c_junc[0] = 1e-15;//F/micron2
+      I_on_n[0] = 1076.9e-6;//A/micron
+      I_on_p[0] = 712.6e-6;//A/micron
+      //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0]  = 3.24e-8;//A/micron
+      I_off_n[0][10] = 4.01e-8;
+      I_off_n[0][20] = 4.90e-8;
+      I_off_n[0][30] = 5.92e-8;
+      I_off_n[0][40] = 7.08e-8;
+      I_off_n[0][50] = 8.38e-8;
+      I_off_n[0][60] = 9.82e-8;
+      I_off_n[0][70] = 1.14e-7;
+      I_off_n[0][80] = 1.29e-7;
+      I_off_n[0][90] = 1.43e-7;
+      I_off_n[0][100] = 1.54e-7;
+
+      I_g_on_n[0][0]  = 1.65e-8;//A/micron
+      I_g_on_n[0][10] = 1.65e-8;
+      I_g_on_n[0][20] = 1.65e-8;
+      I_g_on_n[0][30] = 1.65e-8;
+      I_g_on_n[0][40] = 1.65e-8;
+      I_g_on_n[0][50] = 1.65e-8;
+      I_g_on_n[0][60] = 1.65e-8;
+      I_g_on_n[0][70] = 1.65e-8;
+      I_g_on_n[0][80] = 1.65e-8;
+      I_g_on_n[0][90] = 1.65e-8;
+      I_g_on_n[0][100] = 1.65e-8;
+
+      //ITRS LSTP device type
+      vdd[1]   = 1.3;
+      Lphy[1]  = 0.075;
+      Lelec[1] = 0.0486;
+      t_ox[1]  = 2.2e-3;
+      v_th[1]  = 0.48203;
+      c_ox[1]  = 1.22e-14;
+      mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.373;
+      c_g_ideal[1] = 9.15e-16;
+      c_fringe[1]  = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 503.6e-6;
+      I_on_p[1] = 235.1e-6;
+      nmos_effective_resistance_multiplier = 1.92;
+      n_to_p_eff_curr_drv_ratio[1] = 2.44;
+      gmp_to_gmn_multiplier[1] =0.88;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1;
+      I_off_n[1][0]  = 2.81e-12;
+      I_off_n[1][10] = 4.76e-12;
+      I_off_n[1][20] = 7.82e-12;
+      I_off_n[1][30] = 1.25e-11;
+      I_off_n[1][40] = 1.94e-11;
+      I_off_n[1][50] = 2.94e-11;
+      I_off_n[1][60] = 4.36e-11;
+      I_off_n[1][70] = 6.32e-11;
+      I_off_n[1][80] = 8.95e-11;
+      I_off_n[1][90] = 1.25e-10;
+      I_off_n[1][100] = 1.7e-10;
+
+      I_g_on_n[1][0]  = 3.87e-11;//A/micron
+      I_g_on_n[1][10] = 3.87e-11;
+      I_g_on_n[1][20] = 3.87e-11;
+      I_g_on_n[1][30] = 3.87e-11;
+      I_g_on_n[1][40] = 3.87e-11;
+      I_g_on_n[1][50] = 3.87e-11;
+      I_g_on_n[1][60] = 3.87e-11;
+      I_g_on_n[1][70] = 3.87e-11;
+      I_g_on_n[1][80] = 3.87e-11;
+      I_g_on_n[1][90] = 3.87e-11;
+      I_g_on_n[1][100] = 3.87e-11;
+
+      //ITRS LOP device type
+      vdd[2] = 0.9;
+      Lphy[2] = 0.053;
+      Lelec[2] = 0.0354;
+      t_ox[2] = 1.5e-3;
+      v_th[2] = 0.30764;
+      c_ox[2] = 1.59e-14;
+      mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.113;
+      c_g_ideal[2] = 8.45e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 386.6e-6;
+      I_on_p[2] = 209.7e-6;
+      nmos_effective_resistance_multiplier = 1.77;
+      n_to_p_eff_curr_drv_ratio[2] = 2.54;
+      gmp_to_gmn_multiplier[2] = 0.98;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1;
+      I_off_n[2][0] = 2.14e-9;
+      I_off_n[2][10] = 2.9e-9;
+      I_off_n[2][20] = 3.87e-9;
+      I_off_n[2][30] = 5.07e-9;
+      I_off_n[2][40] = 6.54e-9;
+      I_off_n[2][50] = 8.27e-8;
+      I_off_n[2][60] = 1.02e-7;
+      I_off_n[2][70] = 1.20e-7;
+      I_off_n[2][80] = 1.36e-8;
+      I_off_n[2][90] = 1.52e-8;
+      I_off_n[2][100] = 1.73e-8;
+
+      I_g_on_n[2][0]  = 4.31e-8;//A/micron
+      I_g_on_n[2][10] = 4.31e-8;
+      I_g_on_n[2][20] = 4.31e-8;
+      I_g_on_n[2][30] = 4.31e-8;
+      I_g_on_n[2][40] = 4.31e-8;
+      I_g_on_n[2][50] = 4.31e-8;
+      I_g_on_n[2][60] = 4.31e-8;
+      I_g_on_n[2][70] = 4.31e-8;
+      I_g_on_n[2][80] = 4.31e-8;
+      I_g_on_n[2][90] = 4.31e-8;
+      I_g_on_n[2][100] = 4.31e-8;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.4545;
+        width_dram_access_transistor = 0.14;
+        curr_I_on_dram_cell = 45e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.168;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.4545;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  323.95 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.3;
+        c_g_ideal[3] = 1.47e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 321.6e-6;
+        I_on_p[3] = 203.3e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.42e-11;
+        I_off_n[3][10] = 2.25e-11;
+        I_off_n[3][20] = 3.46e-11;
+        I_off_n[3][30] = 5.18e-11;
+        I_off_n[3][40] = 7.58e-11;
+        I_off_n[3][50] = 1.08e-10;
+        I_off_n[3][60] = 1.51e-10;
+        I_off_n[3][70] = 2.02e-10;
+        I_off_n[3][80] = 2.57e-10;
+        I_off_n[3][90] = 3.14e-10;
+        I_off_n[3][100] = 3.85e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.6;
+        Lphy[3] = 0.09;
+        Lelec[3] = 0.0576;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.09*0.09;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.7;
+        t_ox[3] = 5.5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 5.65e-15;
+        mobility_eff[3] =  302.2 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.32;
+        c_g_ideal[3] = 5.08e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1094.3e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.62;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 5.80e-15;
+        I_off_n[3][10] = 1.21e-14;
+        I_off_n[3][20] = 2.42e-14;
+        I_off_n[3][30] = 4.65e-14;
+        I_off_n[3][40] = 8.60e-14;
+        I_off_n[3][50] = 1.54e-13;
+        I_off_n[3][60] = 2.66e-13;
+        I_off_n[3][70] = 4.45e-13;
+        I_off_n[3][80] = 7.17e-13;
+        I_off_n[3][90] = 1.11e-12;
+        I_off_n[3][100] = 1.67e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360
+      curr_asp_ratio_cell_cam = 2.92;//2.5
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff  = 1;
+      curr_core_tx_density       = 1.25*0.7*0.7;
+      curr_sckt_co_eff           = 1.1539;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+
+
+    }
+
+    if (tech == 65)
+    { //65nm technology-node. Corresponds to year 2007 in ITRS
+      //ITRS HP device type
+      SENSE_AMP_D = .2e-9; // s
+      SENSE_AMP_P = 5.7e-15; // J
+      vdd[0] = 1.1;
+      Lphy[0] = 0.025;
+      Lelec[0] = 0.019;
+      t_ox[0] = 1.1e-3;
+      v_th[0] = .19491;
+      c_ox[0] = 1.88e-14;
+      mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 7.71e-2;
+      c_g_ideal[0] = 4.69e-16;
+      c_fringe[0] = 0.077e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 1197.2e-6;
+      I_on_p[0] = 870.8e-6;
+      nmos_effective_resistance_multiplier = 1.50;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1/3.74;
+      //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first
+      //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74.
+      I_off_n[0][0] = 1.96e-7;
+      I_off_n[0][10] = 2.29e-7;
+      I_off_n[0][20] = 2.66e-7;
+      I_off_n[0][30] = 3.05e-7;
+      I_off_n[0][40] = 3.49e-7;
+      I_off_n[0][50] = 3.95e-7;
+      I_off_n[0][60] = 4.45e-7;
+      I_off_n[0][70] = 4.97e-7;
+      I_off_n[0][80] = 5.48e-7;
+      I_off_n[0][90] = 5.94e-7;
+      I_off_n[0][100] = 6.3e-7;
+      I_g_on_n[0][0]  = 4.09e-8;//A/micron
+      I_g_on_n[0][10] = 4.09e-8;
+      I_g_on_n[0][20] = 4.09e-8;
+      I_g_on_n[0][30] = 4.09e-8;
+      I_g_on_n[0][40] = 4.09e-8;
+      I_g_on_n[0][50] = 4.09e-8;
+      I_g_on_n[0][60] = 4.09e-8;
+      I_g_on_n[0][70] = 4.09e-8;
+      I_g_on_n[0][80] = 4.09e-8;
+      I_g_on_n[0][90] = 4.09e-8;
+      I_g_on_n[0][100] = 4.09e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.2;
+      Lphy[1] = 0.045;
+      Lelec[1] = 0.0298;
+      t_ox[1] = 1.9e-3;
+      v_th[1] = 0.52354;
+      c_ox[1] = 1.36e-14;
+      mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.128;
+      c_g_ideal[1] = 6.14e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 519.2e-6;
+      I_on_p[1] = 266e-6;
+      nmos_effective_resistance_multiplier = 1.96;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.82;
+      I_off_n[1][0] = 9.12e-12;
+      I_off_n[1][10] = 1.49e-11;
+      I_off_n[1][20] = 2.36e-11;
+      I_off_n[1][30] = 3.64e-11;
+      I_off_n[1][40] = 5.48e-11;
+      I_off_n[1][50] = 8.05e-11;
+      I_off_n[1][60] = 1.15e-10;
+      I_off_n[1][70] = 1.59e-10;
+      I_off_n[1][80] = 2.1e-10;
+      I_off_n[1][90] = 2.62e-10;
+      I_off_n[1][100] = 3.21e-10;
+
+      I_g_on_n[1][0]  = 1.09e-10;//A/micron
+      I_g_on_n[1][10] = 1.09e-10;
+      I_g_on_n[1][20] = 1.09e-10;
+      I_g_on_n[1][30] = 1.09e-10;
+      I_g_on_n[1][40] = 1.09e-10;
+      I_g_on_n[1][50] = 1.09e-10;
+      I_g_on_n[1][60] = 1.09e-10;
+      I_g_on_n[1][70] = 1.09e-10;
+      I_g_on_n[1][80] = 1.09e-10;
+      I_g_on_n[1][90] = 1.09e-10;
+      I_g_on_n[1][100] = 1.09e-10;
+
+      //ITRS LOP device type
+      vdd[2] = 0.8;
+      Lphy[2] = 0.032;
+      Lelec[2] = 0.0216;
+      t_ox[2] = 1.2e-3;
+      v_th[2] = 0.28512;
+      c_ox[2] = 1.87e-14;
+      mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.292;
+      c_g_ideal[2] = 6e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 573.1e-6;
+      I_on_p[2] = 340.6e-6;
+      nmos_effective_resistance_multiplier = 1.82;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/2.05;
+      I_off_n[2][0] = 4.9e-9;
+      I_off_n[2][10] = 6.49e-9;
+      I_off_n[2][20] = 8.45e-9;
+      I_off_n[2][30] = 1.08e-8;
+      I_off_n[2][40] = 1.37e-8;
+      I_off_n[2][50] = 1.71e-8;
+      I_off_n[2][60] = 2.09e-8;
+      I_off_n[2][70] = 2.48e-8;
+      I_off_n[2][80] = 2.84e-8;
+      I_off_n[2][90] = 3.13e-8;
+      I_off_n[2][100] = 3.42e-8;
+
+      I_g_on_n[2][0]  = 9.61e-9;//A/micron
+      I_g_on_n[2][10] = 9.61e-9;
+      I_g_on_n[2][20] = 9.61e-9;
+      I_g_on_n[2][30] = 9.61e-9;
+      I_g_on_n[2][40] = 9.61e-9;
+      I_g_on_n[2][50] = 9.61e-9;
+      I_g_on_n[2][60] = 9.61e-9;
+      I_g_on_n[2][70] = 9.61e-9;
+      I_g_on_n[2][80] = 9.61e-9;
+      I_g_on_n[2][90] = 9.61e-9;
+      I_g_on_n[2][100] = 9.61e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.43806;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.11;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.43806;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  328.32 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.43806;
+        c_g_ideal[3] = 1.46e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 399.8e-6;
+        I_on_p[3] = 243.4e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 2.23e-11;
+        I_off_n[3][10] = 3.46e-11;
+        I_off_n[3][20] = 5.24e-11;
+        I_off_n[3][30] = 7.75e-11;
+        I_off_n[3][40] = 1.12e-10;
+        I_off_n[3][50] = 1.58e-10;
+        I_off_n[3][60] = 2.18e-10;
+        I_off_n[3][70] = 2.88e-10;
+        I_off_n[3][80] = 3.63e-10;
+        I_off_n[3][90] = 4.41e-10;
+        I_off_n[3][100] = 5.36e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.3;
+        Lphy[3] = 0.065;
+        Lelec[3] = 0.0426;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.065;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.065*0.065;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.3;
+        t_ox[3] = 5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 6.16e-15;
+        mobility_eff[3] =  303.44 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.385;
+        c_g_ideal[3] = 4e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 1031e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 2.39;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 1.80e-14;
+        I_off_n[3][10] = 3.64e-14;
+        I_off_n[3][20] = 7.03e-14;
+        I_off_n[3][30] = 1.31e-13;
+        I_off_n[3][40] = 2.35e-13;
+        I_off_n[3][50] = 4.09e-13;
+        I_off_n[3][60] = 6.89e-13;
+        I_off_n[3][70] = 1.13e-12;
+        I_off_n[3][80] = 1.78e-12;
+        I_off_n[3][90] = 2.71e-12;
+        I_off_n[3][100] = 3.99e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7; //Rather than scale proportionally to square of feature size, only scale linearly according to IBM cell processor
+      curr_core_tx_density      = 1.25*0.7;
+      curr_sckt_co_eff           = 1.1359;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 45)
+    { //45nm technology-node. Corresponds to year 2010 in ITRS
+      //ITRS HP device type
+      SENSE_AMP_D = .04e-9; // s
+      SENSE_AMP_P = 2.7e-15; // J
+      vdd[0] = 1.0;
+      Lphy[0] = 0.018;
+      Lelec[0] = 0.01345;
+      t_ox[0] = 0.65e-3;
+      v_th[0] = .18035;
+      c_ox[0] = 3.77e-14;
+      mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 9.38E-2;
+      c_g_ideal[0] = 6.78e-16;
+      c_fringe[0] = 0.05e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 2046.6e-6;
+      //There are certain problems with the ITRS PMOS numbers in MASTAR for 45nm. So we are using 65nm values of
+      //n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier for 45nm
+      I_on_p[0] = I_on_n[0] / 2;//This value is fixed arbitrarily but I_on_p is not being used in CACTI
+      nmos_effective_resistance_multiplier = 1.51;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1/3.546;//Using MASTAR, @380K, increase Lgate until Ion reduces to 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74
+      I_off_n[0][0] = 2.8e-7;
+      I_off_n[0][10] = 3.28e-7;
+      I_off_n[0][20] = 3.81e-7;
+      I_off_n[0][30] = 4.39e-7;
+      I_off_n[0][40] = 5.02e-7;
+      I_off_n[0][50] = 5.69e-7;
+      I_off_n[0][60] = 6.42e-7;
+      I_off_n[0][70] = 7.2e-7;
+      I_off_n[0][80] = 8.03e-7;
+      I_off_n[0][90] = 8.91e-7;
+      I_off_n[0][100] = 9.84e-7;
+
+      I_g_on_n[0][0]  = 3.59e-8;//A/micron
+      I_g_on_n[0][10] = 3.59e-8;
+      I_g_on_n[0][20] = 3.59e-8;
+      I_g_on_n[0][30] = 3.59e-8;
+      I_g_on_n[0][40] = 3.59e-8;
+      I_g_on_n[0][50] = 3.59e-8;
+      I_g_on_n[0][60] = 3.59e-8;
+      I_g_on_n[0][70] = 3.59e-8;
+      I_g_on_n[0][80] = 3.59e-8;
+      I_g_on_n[0][90] = 3.59e-8;
+      I_g_on_n[0][100] = 3.59e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.1;
+      Lphy[1] =  0.028;
+      Lelec[1] = 0.0212;
+      t_ox[1] = 1.4e-3;
+      v_th[1] = 0.50245;
+      c_ox[1] = 2.01e-14;
+      mobility_eff[1] =  363.96 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 9.12e-2;
+      c_g_ideal[1] = 5.18e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 666.2e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.08;
+      I_off_n[1][0] = 1.01e-11;
+      I_off_n[1][10] = 1.65e-11;
+      I_off_n[1][20] = 2.62e-11;
+      I_off_n[1][30] = 4.06e-11;
+      I_off_n[1][40] = 6.12e-11;
+      I_off_n[1][50] = 9.02e-11;
+      I_off_n[1][60] = 1.3e-10;
+      I_off_n[1][70] = 1.83e-10;
+      I_off_n[1][80] = 2.51e-10;
+      I_off_n[1][90] = 3.29e-10;
+      I_off_n[1][100] = 4.1e-10;
+
+      I_g_on_n[1][0]  = 9.47e-12;//A/micron
+      I_g_on_n[1][10] = 9.47e-12;
+      I_g_on_n[1][20] = 9.47e-12;
+      I_g_on_n[1][30] = 9.47e-12;
+      I_g_on_n[1][40] = 9.47e-12;
+      I_g_on_n[1][50] = 9.47e-12;
+      I_g_on_n[1][60] = 9.47e-12;
+      I_g_on_n[1][70] = 9.47e-12;
+      I_g_on_n[1][80] = 9.47e-12;
+      I_g_on_n[1][90] = 9.47e-12;
+      I_g_on_n[1][100] = 9.47e-12;
+
+      //ITRS LOP device type
+      vdd[2] = 0.7;
+      Lphy[2] = 0.022;
+      Lelec[2] = 0.016;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.22599;
+      c_ox[2] = 2.82e-14;//F/micron2
+      mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 5.71e-2;
+      c_g_ideal[2] = 6.2e-16;
+      c_fringe[2] = 0.073e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 748.9e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.76;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.92;
+      I_off_n[2][0] = 4.03e-9;
+      I_off_n[2][10] = 5.02e-9;
+      I_off_n[2][20] = 6.18e-9;
+      I_off_n[2][30] = 7.51e-9;
+      I_off_n[2][40] = 9.04e-9;
+      I_off_n[2][50] = 1.08e-8;
+      I_off_n[2][60] = 1.27e-8;
+      I_off_n[2][70] = 1.47e-8;
+      I_off_n[2][80] = 1.66e-8;
+      I_off_n[2][90] = 1.84e-8;
+      I_off_n[2][100] = 2.03e-8;
+
+      I_g_on_n[2][0]  = 3.24e-8;//A/micron
+      I_g_on_n[2][10] = 4.01e-8;
+      I_g_on_n[2][20] = 4.90e-8;
+      I_g_on_n[2][30] = 5.92e-8;
+      I_g_on_n[2][40] = 7.08e-8;
+      I_g_on_n[2][50] = 8.38e-8;
+      I_g_on_n[2][60] = 9.82e-8;
+      I_g_on_n[2][70] = 1.14e-7;
+      I_g_on_n[2][80] = 1.29e-7;
+      I_g_on_n[2][90] = 1.43e-7;
+      I_g_on_n[2][100] = 1.54e-7;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.078;
+        Lelec[3] = 0.0504;// Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44559;
+        width_dram_access_transistor = 0.079;
+        curr_I_on_dram_cell = 36e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2.1e-3;
+        v_th[3] = 0.44559;
+        c_ox[3] = 1.41e-14;
+        mobility_eff[3] =   426.30 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.181;
+        c_g_ideal[3] = 1.10e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 456e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 2.54e-11;
+        I_off_n[3][10] = 3.94e-11;
+        I_off_n[3][20] = 5.95e-11;
+        I_off_n[3][30] = 8.79e-11;
+        I_off_n[3][40] = 1.27e-10;
+        I_off_n[3][50] = 1.79e-10;
+        I_off_n[3][60] = 2.47e-10;
+        I_off_n[3][70] = 3.31e-10;
+        I_off_n[3][80] = 4.26e-10;
+        I_off_n[3][90] = 5.27e-10;
+        I_off_n[3][100] = 6.46e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.045;
+        Lelec[3] = 0.0298;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.045;
+        curr_I_on_dram_cell = 20e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = 6*0.045*0.045;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.7;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.98e-15;
+        mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.147;
+        c_g_ideal[3] = 3.59e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 999.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.31e-14;
+        I_off_n[3][10] = 2.68e-14;
+        I_off_n[3][20] = 5.25e-14;
+        I_off_n[3][30] = 9.88e-14;
+        I_off_n[3][40] = 1.79e-13;
+        I_off_n[3][50] = 3.15e-13;
+        I_off_n[3][60] = 5.36e-13;
+        I_off_n[3][70] = 8.86e-13;
+        I_off_n[3][80] = 1.42e-12;
+        I_off_n[3][90] = 2.20e-12;
+        I_off_n[3][100] = 3.29e-12;
+      }
+
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7;
+      curr_core_tx_density      = 1.25;
+      curr_sckt_co_eff           = 1.1387;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 32)
+    {
+      SENSE_AMP_D = .03e-9; // s
+      SENSE_AMP_P = 2.16e-15; // J
+      //For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is 32 nm
+      //technology i.e. FEATURESIZE = 0.032). Using the SOI process numbers for
+      //HP and LSTP.
+      vdd[0] = 0.9;
+      Lphy[0] = 0.013;
+      Lelec[0] = 0.01013;
+      t_ox[0] = 0.5e-3;
+      v_th[0] = 0.21835;
+      c_ox[0] = 4.11e-14;
+      mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 5.09E-2;
+      c_g_ideal[0] = 5.34e-16;
+      c_fringe[0] = 0.04e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] =  2211.7e-6;
+      I_on_p[0] = I_on_n[0] / 2;
+      nmos_effective_resistance_multiplier = 1.49;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1/3.706;
+      //Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate until Ion reduces to 95% or Lgate increase by 5% (DG device can only increase by 5%),
+      //whichever comes first
+      I_off_n[0][0] = 1.52e-7;
+      I_off_n[0][10] = 1.55e-7;
+      I_off_n[0][20] = 1.59e-7;
+      I_off_n[0][30] = 1.68e-7;
+      I_off_n[0][40] = 1.90e-7;
+      I_off_n[0][50] = 2.69e-7;
+      I_off_n[0][60] = 5.32e-7;
+      I_off_n[0][70] = 1.02e-6;
+      I_off_n[0][80] = 1.62e-6;
+      I_off_n[0][90] = 2.73e-6;
+      I_off_n[0][100] = 6.1e-6;
+
+      I_g_on_n[0][0]  = 6.55e-8;//A/micron
+      I_g_on_n[0][10] = 6.55e-8;
+      I_g_on_n[0][20] = 6.55e-8;
+      I_g_on_n[0][30] = 6.55e-8;
+      I_g_on_n[0][40] = 6.55e-8;
+      I_g_on_n[0][50] = 6.55e-8;
+      I_g_on_n[0][60] = 6.55e-8;
+      I_g_on_n[0][70] = 6.55e-8;
+      I_g_on_n[0][80] = 6.55e-8;
+      I_g_on_n[0][90] = 6.55e-8;
+      I_g_on_n[0][100] = 6.55e-8;
+
+//      32 DG
+//      I_g_on_n[0][0]  = 2.71e-9;//A/micron
+//      I_g_on_n[0][10] = 2.71e-9;
+//      I_g_on_n[0][20] = 2.71e-9;
+//      I_g_on_n[0][30] = 2.71e-9;
+//      I_g_on_n[0][40] = 2.71e-9;
+//      I_g_on_n[0][50] = 2.71e-9;
+//      I_g_on_n[0][60] = 2.71e-9;
+//      I_g_on_n[0][70] = 2.71e-9;
+//      I_g_on_n[0][80] = 2.71e-9;
+//      I_g_on_n[0][90] = 2.71e-9;
+//      I_g_on_n[0][100] = 2.71e-9;
+
+      //LSTP device type
+      vdd[1] = 1;
+      Lphy[1] = 0.020;
+      Lelec[1] = 0.0173;
+      t_ox[1] = 1.2e-3;
+      v_th[1] = 0.513;
+      c_ox[1] = 2.29e-14;
+      mobility_eff[1] =  347.46 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 8.64e-2;
+      c_g_ideal[1] = 4.58e-16;
+      c_fringe[1] = 0.053e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 683.6e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/1.93;
+      I_off_n[1][0] = 2.06e-11;
+      I_off_n[1][10] = 3.30e-11;
+      I_off_n[1][20] = 5.15e-11;
+      I_off_n[1][30] = 7.83e-11;
+      I_off_n[1][40] = 1.16e-10;
+      I_off_n[1][50] = 1.69e-10;
+      I_off_n[1][60] = 2.40e-10;
+      I_off_n[1][70] = 3.34e-10;
+      I_off_n[1][80] = 4.54e-10;
+      I_off_n[1][90] = 5.96e-10;
+      I_off_n[1][100] = 7.44e-10;
+
+      I_g_on_n[1][0]  = 3.73e-11;//A/micron
+      I_g_on_n[1][10] = 3.73e-11;
+      I_g_on_n[1][20] = 3.73e-11;
+      I_g_on_n[1][30] = 3.73e-11;
+      I_g_on_n[1][40] = 3.73e-11;
+      I_g_on_n[1][50] = 3.73e-11;
+      I_g_on_n[1][60] = 3.73e-11;
+      I_g_on_n[1][70] = 3.73e-11;
+      I_g_on_n[1][80] = 3.73e-11;
+      I_g_on_n[1][90] = 3.73e-11;
+      I_g_on_n[1][100] = 3.73e-11;
+
+
+      //LOP device type
+      vdd[2] = 0.6;
+      Lphy[2] = 0.016;
+      Lelec[2] = 0.01232;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.24227;
+      c_ox[2] = 2.84e-14;
+      mobility_eff[2] =  513.52 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 4.64e-2;
+      c_g_ideal[2] = 4.54e-16;
+      c_fringe[2] = 0.057e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 827.8e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.73;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.89;
+      I_off_n[2][0] = 5.94e-8;
+      I_off_n[2][10] = 7.23e-8;
+      I_off_n[2][20] = 8.7e-8;
+      I_off_n[2][30] = 1.04e-7;
+      I_off_n[2][40] = 1.22e-7;
+      I_off_n[2][50] = 1.43e-7;
+      I_off_n[2][60] = 1.65e-7;
+      I_off_n[2][70] = 1.90e-7;
+      I_off_n[2][80] = 2.15e-7;
+      I_off_n[2][90] = 2.39e-7;
+      I_off_n[2][100] = 2.63e-7;
+
+      I_g_on_n[2][0]  = 2.93e-9;//A/micron
+      I_g_on_n[2][10] = 2.93e-9;
+      I_g_on_n[2][20] = 2.93e-9;
+      I_g_on_n[2][30] = 2.93e-9;
+      I_g_on_n[2][40] = 2.93e-9;
+      I_g_on_n[2][50] = 2.93e-9;
+      I_g_on_n[2][60] = 2.93e-9;
+      I_g_on_n[2][70] = 2.93e-9;
+      I_g_on_n[2][80] = 2.93e-9;
+      I_g_on_n[2][90] = 2.93e-9;
+      I_g_on_n[2][100] = 2.93e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.056;
+        Lelec[3] = 0.0419;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44129;
+        width_dram_access_transistor = 0.056;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2e-3;
+        v_th[3] = 0.44467;
+        c_ox[3] = 1.48e-14;
+        mobility_eff[3] =  408.12 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.174;
+        c_g_ideal[3] = 7.45e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1055.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.57e-11;
+        I_off_n[3][10] = 5.51e-11;
+        I_off_n[3][20] = 8.27e-11;
+        I_off_n[3][30] = 1.21e-10;
+        I_off_n[3][40] = 1.74e-10;
+        I_off_n[3][50] = 2.45e-10;
+        I_off_n[3][60] = 3.38e-10;
+        I_off_n[3][70] = 4.53e-10;
+        I_off_n[3][80] = 5.87e-10;
+        I_off_n[3][90] = 7.29e-10;
+        I_off_n[3][100] = 8.87e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.032;
+        Lelec[3] = 0.0205;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.032;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.032*0.032;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.6;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.99e-15;
+        mobility_eff[3] =  380.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.129;
+        c_g_ideal[3] = 2.56e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1024.5e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.63e-14;
+        I_off_n[3][10] = 7.18e-14;
+        I_off_n[3][20] = 1.36e-13;
+        I_off_n[3][30] = 2.49e-13;
+        I_off_n[3][40] = 4.41e-13;
+        I_off_n[3][50] = 7.55e-13;
+        I_off_n[3][60] = 1.26e-12;
+        I_off_n[3][70] = 2.03e-12;
+        I_off_n[3][80] = 3.19e-12;
+        I_off_n[3][90] = 4.87e-12;
+        I_off_n[3][100] = 7.16e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7*0.7;
+      curr_core_tx_density      = 1.25/0.7;
+      curr_sckt_co_eff           = 1.1111;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if(tech == 22){
+        SENSE_AMP_D = .03e-9; // s
+        SENSE_AMP_P = 2.16e-15; // J
+        //For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is 22 nm
+        //technology i.e. FEATURESIZE = 0.022). Using the DG process numbers for HP.
+        //22 nm HP
+        vdd[0] = 0.8;
+        Lphy[0] = 0.009;//Lphy is the physical gate-length.
+        Lelec[0] = 0.00468;//Lelec is the electrical gate-length.
+        t_ox[0] = 0.55e-3;//micron
+        v_th[0] = 0.1395;//V
+        c_ox[0] = 3.63e-14;//F/micron2
+        mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[0] = 2.33e-2; //V/micron
+        c_g_ideal[0] = 3.27e-16;//F/micron
+        c_fringe[0] = 0.06e-15;//F/micron
+        c_junc[0] = 0;//F/micron2
+        I_on_n[0] =  2626.4e-6;//A/micron
+        I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier = 1.45;
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+        //"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/3.274;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2;//From 22nm, leakage current are directly from ITRS report rather than MASTAR, since MASTAR has serious bugs there.
+        I_off_n[0][10] = 1.55e-7/1.5*1.2;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2;
+        //for 22nm DG HP
+        I_g_on_n[0][0]  = 1.81e-9;//A/micron
+        I_g_on_n[0][10] = 1.81e-9;
+        I_g_on_n[0][20] = 1.81e-9;
+        I_g_on_n[0][30] = 1.81e-9;
+        I_g_on_n[0][40] = 1.81e-9;
+        I_g_on_n[0][50] = 1.81e-9;
+        I_g_on_n[0][60] = 1.81e-9;
+        I_g_on_n[0][70] = 1.81e-9;
+        I_g_on_n[0][80] = 1.81e-9;
+        I_g_on_n[0][90] = 1.81e-9;
+        I_g_on_n[0][100] = 1.81e-9;
+
+        //22 nm LSTP DG
+        vdd[1] = 0.8;
+        Lphy[1] = 0.014;
+        Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+        t_ox[1] = 1.1e-3;//micron
+        v_th[1] = 0.40126;//V
+        c_ox[1] = 2.30e-14;//F/micron2
+        mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[1] = 6.64e-2; //V/micron
+        c_g_ideal[1] = 3.22e-16;//F/micron
+        c_fringe[1] = 0.08e-15;
+        c_junc[1] = 0;//F/micron2
+        I_on_n[1] = 727.6e-6;//A/micron
+        I_on_p[1] = I_on_n[1] / 2;
+        nmos_effective_resistance_multiplier = 1.99;
+        n_to_p_eff_curr_drv_ratio[1] = 2;
+        gmp_to_gmn_multiplier[1] = 0.99;
+        Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+        Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+        long_channel_leakage_reduction[1] = 1/1.89;
+        I_off_n[1][0] = 2.43e-11;
+        I_off_n[1][10] = 4.85e-11;
+        I_off_n[1][20] = 9.68e-11;
+        I_off_n[1][30] = 1.94e-10;
+        I_off_n[1][40] = 3.87e-10;
+        I_off_n[1][50] = 7.73e-10;
+        I_off_n[1][60] = 3.55e-10;
+        I_off_n[1][70] = 3.09e-9;
+        I_off_n[1][80] = 6.19e-9;
+        I_off_n[1][90] = 1.24e-8;
+        I_off_n[1][100]= 2.48e-8;
+
+        I_g_on_n[1][0]  = 4.51e-10;//A/micron
+        I_g_on_n[1][10] = 4.51e-10;
+        I_g_on_n[1][20] = 4.51e-10;
+        I_g_on_n[1][30] = 4.51e-10;
+        I_g_on_n[1][40] = 4.51e-10;
+        I_g_on_n[1][50] = 4.51e-10;
+        I_g_on_n[1][60] = 4.51e-10;
+        I_g_on_n[1][70] = 4.51e-10;
+        I_g_on_n[1][80] = 4.51e-10;
+        I_g_on_n[1][90] = 4.51e-10;
+        I_g_on_n[1][100] = 4.51e-10;
+
+        //22 nm LOP
+        vdd[2] = 0.6;
+        Lphy[2] = 0.011;
+        Lelec[2] = 0.00604;//Lelec is the electrical gate-length.
+        t_ox[2] = 0.8e-3;//micron
+        v_th[2] = 0.2315;//V
+        c_ox[2] = 2.87e-14;//F/micron2
+        mobility_eff[2] =  698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[2] = 1.81e-2; //V/micron
+        c_g_ideal[2] = 3.16e-16;//F/micron
+        c_fringe[2] = 0.08e-15;
+        c_junc[2] = 0;//F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab
+        I_on_n[2] = 916.1e-6;//A/micron
+        I_on_p[2] = I_on_n[2] / 2;
+        nmos_effective_resistance_multiplier = 1.73;
+        n_to_p_eff_curr_drv_ratio[2] = 2;
+        gmp_to_gmn_multiplier[2] = 1.11;
+        Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];//ohm-micron
+        Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];//ohm-micron
+        long_channel_leakage_reduction[2] = 1/2.38;
+
+        I_off_n[2][0] = 1.31e-8;
+        I_off_n[2][10] = 2.60e-8;
+        I_off_n[2][20] = 5.14e-8;
+        I_off_n[2][30] = 1.02e-7;
+        I_off_n[2][40] = 2.02e-7;
+        I_off_n[2][50] = 3.99e-7;
+        I_off_n[2][60] = 7.91e-7;
+        I_off_n[2][70] = 1.09e-6;
+        I_off_n[2][80] = 2.09e-6;
+        I_off_n[2][90] = 4.04e-6;
+        I_off_n[2][100]= 4.48e-6;
+
+        I_g_on_n[2][0]  = 2.74e-9;//A/micron
+        I_g_on_n[2][10] = 2.74e-9;
+        I_g_on_n[2][20] = 2.74e-9;
+        I_g_on_n[2][30] = 2.74e-9;
+        I_g_on_n[2][40] = 2.74e-9;
+        I_g_on_n[2][50] = 2.74e-9;
+        I_g_on_n[2][60] = 2.74e-9;
+        I_g_on_n[2][70] = 2.74e-9;
+        I_g_on_n[2][80] = 2.74e-9;
+        I_g_on_n[2][90] = 2.74e-9;
+        I_g_on_n[2][100] = 2.74e-9;
+
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+        //22 nm commodity DRAM cell access transistor technology parameters.
+                //parameters
+                curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+                //2005 ITRS, the value was about twice the value in 2007 ITRS
+                Lphy[3] = 0.022;//micron
+                Lelec[3] = 0.0181;//micron.
+                curr_v_th_dram_access_transistor = 1;//V
+                width_dram_access_transistor = 0.022;//micron
+                curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+                //kept constant. In reality this could perhaps be lower
+                curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+                curr_Wmemcella_dram = width_dram_access_transistor;
+                curr_Wmemcellpmos_dram = 0;
+                curr_Wmemcellnmos_dram = 0;
+                curr_area_cell_dram = 6*0.022*0.022;//micron2.
+                curr_asp_ratio_cell_dram = 0.667;
+                curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+                //kept constant.
+
+        //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR.
+                curr_vpp = 2.3;//vpp. V
+                t_ox[3] = 3.5e-3;//micron
+                v_th[3] = 1.0;//V
+                c_ox[3] = 9.06e-15;//F/micron2
+                mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+                Vdsat[3] = 0.0972; //V/micron
+                c_g_ideal[3] = 1.99e-16;//F/micron
+                c_fringe[3] = 0.053e-15;//F/micron
+                c_junc[3] = 1e-15;//F/micron2
+                I_on_n[3] = 910.5e-6;//A/micron
+                I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used.
+                nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+                //
+                n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+                gmp_to_gmn_multiplier[3] = 0.90;
+                Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+                Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+                long_channel_leakage_reduction[3] = 1;
+                I_off_n[3][0] = 1.1e-13; //A/micron
+                I_off_n[3][10] = 2.11e-13;
+                I_off_n[3][20] = 3.88e-13;
+                I_off_n[3][30] = 6.9e-13;
+                I_off_n[3][40] = 1.19e-12;
+                I_off_n[3][50] = 1.98e-12;
+                I_off_n[3][60] = 3.22e-12;
+                I_off_n[3][70] = 5.09e-12;
+                I_off_n[3][80] = 7.85e-12;
+                I_off_n[3][90] = 1.18e-11;
+                I_off_n[3][100] = 1.72e-11;
+
+        }
+        else
+        {
+          //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+        }
+
+    if(tech == 16){
+        //For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is 16 nm
+        //technology i.e. FEATURESIZE = 0.016). Using the DG process numbers for HP.
+        //16 nm HP
+        vdd[0] = 0.7;
+        Lphy[0] = 0.006;//Lphy is the physical gate-length.
+        Lelec[0] = 0.00315;//Lelec is the electrical gate-length.
+        t_ox[0] = 0.5e-3;//micron
+        v_th[0] = 0.1489;//V
+        c_ox[0] = 3.83e-14;//F/micron2 Cox_elec in MASTAR
+        mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[0] = 1.42e-2; //V/micron calculated in spreadsheet
+        c_g_ideal[0] = 2.30e-16;//F/micron
+        c_fringe[0] = 0.06e-15;//F/micron MASTAR inputdynamic/3
+        c_junc[0] = 0;//F/micron2 MASTAR result dynamic
+        I_on_n[0] =  2768.4e-6;//A/micron
+        I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier = 1.48;//nmos_effective_resistance_multiplier  is the ratio of Ieff to Idsat where Ieff is the effective NMOS current and Idsat is the saturation current.
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+        //"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/2.655;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2*1.07;
+        I_off_n[0][10] = 1.55e-7/1.5*1.2*1.07;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2*1.07;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2*1.07;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2*1.07;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2*1.07;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2*1.07;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2*1.07;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2*1.07;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2*1.07;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2*1.07;
+        //for 16nm DG HP
+        I_g_on_n[0][0]  = 1.07e-9;//A/micron
+        I_g_on_n[0][10] = 1.07e-9;
+        I_g_on_n[0][20] = 1.07e-9;
+        I_g_on_n[0][30] = 1.07e-9;
+        I_g_on_n[0][40] = 1.07e-9;
+        I_g_on_n[0][50] = 1.07e-9;
+        I_g_on_n[0][60] = 1.07e-9;
+        I_g_on_n[0][70] = 1.07e-9;
+        I_g_on_n[0][80] = 1.07e-9;
+        I_g_on_n[0][90] = 1.07e-9;
+        I_g_on_n[0][100] = 1.07e-9;
+
+//     //16 nm LSTP DG
+//     vdd[1] = 0.8;
+//     Lphy[1] = 0.014;
+//     Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+//     t_ox[1] = 1.1e-3;//micron
+//     v_th[1] = 0.40126;//V
+//     c_ox[1] = 2.30e-14;//F/micron2
+//     mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+//     Vdsat[1] = 6.64e-2; //V/micron
+//     c_g_ideal[1] = 3.22e-16;//F/micron
+//     c_fringe[1] = 0.008e-15;
+//     c_junc[1] = 0;//F/micron2
+//     I_on_n[1] = 727.6e-6;//A/micron
+//     I_on_p[1] = I_on_n[1] / 2;
+//     nmos_effective_resistance_multiplier = 1.99;
+//     n_to_p_eff_curr_drv_ratio[1] = 2;
+//     gmp_to_gmn_multiplier[1] = 0.99;
+//     Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+//     Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+//     I_off_n[1][0] = 2.43e-11;
+//     I_off_n[1][10] = 4.85e-11;
+//     I_off_n[1][20] = 9.68e-11;
+//     I_off_n[1][30] = 1.94e-10;
+//     I_off_n[1][40] = 3.87e-10;
+//     I_off_n[1][50] = 7.73e-10;
+//     I_off_n[1][60] = 3.55e-10;
+//     I_off_n[1][70] = 3.09e-9;
+//     I_off_n[1][80] = 6.19e-9;
+//     I_off_n[1][90] = 1.24e-8;
+//     I_off_n[1][100]= 2.48e-8;
+//
+//     //    for 22nm LSTP HP
+//     I_g_on_n[1][0]  = 4.51e-10;//A/micron
+//     I_g_on_n[1][10] = 4.51e-10;
+//     I_g_on_n[1][20] = 4.51e-10;
+//     I_g_on_n[1][30] = 4.51e-10;
+//     I_g_on_n[1][40] = 4.51e-10;
+//     I_g_on_n[1][50] = 4.51e-10;
+//     I_g_on_n[1][60] = 4.51e-10;
+//     I_g_on_n[1][70] = 4.51e-10;
+//     I_g_on_n[1][80] = 4.51e-10;
+//     I_g_on_n[1][90] = 4.51e-10;
+//     I_g_on_n[1][100] = 4.51e-10;
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+        //22 nm commodity DRAM cell access transistor technology parameters (the 22 nm values are reused unchanged for the 16 nm node).
+                //parameters
+                curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+                //2005 ITRS, the value was about twice the value in 2007 ITRS
+                Lphy[3] = 0.022;//micron
+                Lelec[3] = 0.0181;//micron.
+                curr_v_th_dram_access_transistor = 1;//V
+                width_dram_access_transistor = 0.022;//micron
+                curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+                //kept constant. In reality this could perhaps be lower
+                curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+                curr_Wmemcella_dram = width_dram_access_transistor;
+                curr_Wmemcellpmos_dram = 0;
+                curr_Wmemcellnmos_dram = 0;
+                curr_area_cell_dram = 6*0.022*0.022;//micron2.
+                curr_asp_ratio_cell_dram = 0.667;
+                curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+                //kept constant.
+
+        //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR (the 22 nm values are reused unchanged for the 16 nm node).
+                curr_vpp = 2.3;//vpp. V
+                t_ox[3] = 3.5e-3;//micron
+                v_th[3] = 1.0;//V
+                c_ox[3] = 9.06e-15;//F/micron2
+                mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+                Vdsat[3] = 0.0972; //V/micron
+                c_g_ideal[3] = 1.99e-16;//F/micron
+                c_fringe[3] = 0.053e-15;//F/micron
+                c_junc[3] = 1e-15;//F/micron2
+                I_on_n[3] = 910.5e-6;//A/micron
+                I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used.
+                nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+                //
+                n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+                gmp_to_gmn_multiplier[3] = 0.90;
+                Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+                Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+                long_channel_leakage_reduction[3] = 1;
+                I_off_n[3][0] = 1.1e-13; //A/micron
+                I_off_n[3][10] = 2.11e-13;
+                I_off_n[3][20] = 3.88e-13;
+                I_off_n[3][30] = 6.9e-13;
+                I_off_n[3][40] = 1.19e-12;
+                I_off_n[3][50] = 1.98e-12;
+                I_off_n[3][60] = 3.22e-12;
+                I_off_n[3][70] = 5.09e-12;
+                I_off_n[3][80] = 7.85e-12;
+                I_off_n[3][90] = 1.18e-11;
+                I_off_n[3][100] = 1.72e-11;
+
+        }
+        else
+        {
+          //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+        }
+
+
+    g_tp.peri_global.Vdd       += curr_alpha * vdd[peri_global_tech_type];
+    g_tp.peri_global.t_ox      += curr_alpha * t_ox[peri_global_tech_type];
+    g_tp.peri_global.Vth       += curr_alpha * v_th[peri_global_tech_type];
+    g_tp.peri_global.C_ox      += curr_alpha * c_ox[peri_global_tech_type];
+    g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type];
+    g_tp.peri_global.C_fringe  += curr_alpha * c_fringe[peri_global_tech_type];
+    g_tp.peri_global.C_junc    += curr_alpha * c_junc[peri_global_tech_type];
+    g_tp.peri_global.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.peri_global.l_phy     += curr_alpha * Lphy[peri_global_tech_type];
+    g_tp.peri_global.l_elec    += curr_alpha * Lelec[peri_global_tech_type];
+    g_tp.peri_global.I_on_n    += curr_alpha * I_on_n[peri_global_tech_type];
+    g_tp.peri_global.R_nch_on  += curr_alpha * Rnchannelon[peri_global_tech_type];
+    g_tp.peri_global.R_pch_on  += curr_alpha * Rpchannelon[peri_global_tech_type];
+    g_tp.peri_global.n_to_p_eff_curr_drv_ratio
+      += curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type];
+    g_tp.peri_global.long_channel_leakage_reduction
+      += curr_alpha * long_channel_leakage_reduction[peri_global_tech_type];
+    g_tp.peri_global.I_off_n   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_off_p   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_n   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_p   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    gmp_to_gmn_multiplier_periph_global += curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type];
+
+    g_tp.sram_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.sram_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.sram_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.sram_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.sram_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.sram_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.sram_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.sram_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.sram_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.sram_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram_cell_Vdd      += curr_alpha * curr_vdd_dram_cell;
+    g_tp.dram_acc.Vth       += curr_alpha * curr_v_th_dram_access_transistor;
+    g_tp.dram_acc.l_phy     += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_acc.l_elec    += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_fringe  += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc    += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_cell_I_on     += curr_alpha * curr_I_on_dram_cell;
+    g_tp.dram_cell_I_off_worst_case_len_temp += curr_alpha * curr_I_off_dram_cell_worst_case_length_temp;
+    g_tp.dram_acc.I_on_n    += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_cell_C        += curr_alpha * curr_c_dram_cell;
+    g_tp.vpp                += curr_alpha * curr_vpp;
+    g_tp.dram_wl.l_phy      += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_wl.l_elec     += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_g_ideal  += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_fringe   += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc     += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_wl.I_on_n     += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_nch_on   += curr_alpha * Rnchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_pch_on   += curr_alpha * Rpchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor];
+    g_tp.dram_wl.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor];
+    g_tp.dram_wl.I_off_n    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+    g_tp.dram_wl.I_off_p    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+
+    g_tp.cam_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.cam_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.cam_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.cam_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.cam_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.cam_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.cam_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.cam_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.cam_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.cam_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram.cell_a_w    += curr_alpha * curr_Wmemcella_dram;
+    g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram;
+    g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram;
+    area_cell_dram        += curr_alpha * curr_area_cell_dram;
+    asp_ratio_cell_dram   += curr_alpha * curr_asp_ratio_cell_dram;
+
+    g_tp.sram.cell_a_w    += curr_alpha * curr_Wmemcella_sram;
+    g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram;
+    g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram;
+    area_cell_sram += curr_alpha * curr_area_cell_sram;
+    asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram;
+
+    g_tp.cam.cell_a_w    += curr_alpha * curr_Wmemcella_cam;//sheng
+    g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam;
+    g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam;
+    area_cell_cam += curr_alpha * curr_area_cell_cam;
+    asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam;
+
+    //Sense amplifier latch Gm calculation
+    mobility_eff_periph_global += curr_alpha * mobility_eff[peri_global_tech_type];
+    Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type];
+
+    //Empirical undifferentiated core/FU coefficient
+    g_tp.scaling_factor.logic_scaling_co_eff += curr_alpha * curr_logic_scaling_co_eff;
+    g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density;
+    g_tp.chip_layout_overhead  += curr_alpha * curr_chip_layout_overhead;
+    g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead;
+    g_tp.sckt_co_eff           += curr_alpha * curr_sckt_co_eff;
+  }
+
+
+  //Currently we are not modeling the resistance/capacitance of poly anywhere.
+  //Continuous functions (or data that have already been processed) do not need linear interpolation
+  g_tp.w_comp_inv_p1 = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n1 =  7.5 * g_ip->F_sz_um;//this was  6 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p2 =   25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n2 =   15 * g_ip->F_sz_um;//this was 12 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p3 =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n3 =   30 * g_ip->F_sz_um;//this was 24 micron for the 0.8 micron process
+  g_tp.w_eval_inv_p  =  100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process
+  g_tp.w_eval_inv_n  =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_n     = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_p     = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process
+
+  g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um;
+  g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um;
+  g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um;
+  g_tp.cell_h_def = 50 * g_ip->F_sz_um;
+  g_tp.w_poly_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um;
+  g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um;
+
+  g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2;
+  g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um;
+  g_tp.w_iso       = 12.5*g_ip->F_sz_um;//was 10 micron for the 0.8 micron process
+  g_tp.w_sense_n   = 3.75*g_ip->F_sz_um; // sense amplifier N-trans; was 3 micron for the 0.8 micron process
+  g_tp.w_sense_p   = 7.5*g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron for the 0.8 micron process
+  g_tp.w_sense_en  = 5*g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was 4 micron for the 0.8 micron process
+  g_tp.w_nmos_b_mux  = 6 * g_tp.min_w_nmos_;
+  g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_;
+
+  if (ram_cell_tech_type == comm_dram)
+  {
+    g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um;
+    g_tp.h_dec          = 8;  // in the unit of memory cell height
+  }
+  else
+  {
+    g_tp.max_w_nmos_dec = g_tp.max_w_nmos_;
+    g_tp.h_dec          = 4;  // in the unit of memory cell height
+  }
+
+  g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal;
+  g_tp.sram_cell.C_overlap   = 0.2 * g_tp.sram_cell.C_g_ideal;
+  g_tp.cam_cell.C_overlap    = 0.2 * g_tp.cam_cell.C_g_ideal;
+
+  g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal;
+  g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n;
+  //g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p;
+
+  g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal;
+
+  double gmn_sense_amp_latch = (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox * (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global;
+  double gmp_sense_amp_latch = gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch;
+  g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch;
+
+  g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram));
+  g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w;
+  g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram));
+  g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w;
+  g_tp.cam.b_w =  sqrt(area_cell_cam / (asp_ratio_cell_cam));//Sheng
+  g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w;
+
+  g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd;
+  g_tp.sram.Vbitpre = vdd[ram_cell_tech_type];
+  g_tp.cam.Vbitpre = vdd[ram_cell_tech_type];//Sheng
+  pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+  g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+
+
+  double wire_pitch       [NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES];
+
+  for (iter=0; iter<=1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 180)
+    {
+        //Aggressive projections
+        wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron
+        aspect_ratio[0][0] = 2.0;
+        wire_width = wire_pitch[0][0] / 2; //micron
+        wire_thickness = aspect_ratio[0][0] * wire_width;//micron
+        wire_spacing = wire_pitch[0][0] - wire_width;//micron
+        barrier_thickness = 0.017;//micron
+        dishing_thickness = 0;//micron
+        alpha_scatter = 1;
+        wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron
+        ild_thickness[0][0] = 0.75;//micron
+        miller_value[0][0] = 1.5;
+        horiz_dielectric_constant[0][0] = 2.709;
+        vert_dielectric_constant[0][0] = 3.9;
+        fringe_cap = 0.115e-15; //F/micron
+        wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap);//F/micron.
+
+        wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+        wire_width = wire_pitch[0][1] / 2;
+        aspect_ratio[0][1] = 2.4;
+        wire_thickness = aspect_ratio[0][1] * wire_width;
+        wire_spacing = wire_pitch[0][1] - wire_width;
+        wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+        ild_thickness[0][1] = 0.75;//micron
+        miller_value[0][1] = 1.5;
+        horiz_dielectric_constant[0][1] = 2.709;
+        vert_dielectric_constant[0][1] = 3.9;
+        fringe_cap = 0.115e-15; //F/micron
+        wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+        wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+        aspect_ratio[0][2] = 2.2;
+        wire_width = wire_pitch[0][2] / 2;
+        wire_thickness = aspect_ratio[0][2] * wire_width;
+        wire_spacing = wire_pitch[0][2] - wire_width;
+        wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+        ild_thickness[0][2] = 1.5;
+        miller_value[0][2] = 1.5;
+        horiz_dielectric_constant[0][2] = 2.709;
+        vert_dielectric_constant[0][2] = 3.9;
+        wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+        //Conservative projections
+        wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+        aspect_ratio[1][0]= 2.0;
+        wire_width = wire_pitch[1][0] / 2;
+        wire_thickness = aspect_ratio[1][0] * wire_width;
+        wire_spacing = wire_pitch[1][0] - wire_width;
+        barrier_thickness = 0.017;
+        dishing_thickness = 0;
+        alpha_scatter = 1;
+        wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+        ild_thickness[1][0] = 0.75;
+        miller_value[1][0] = 1.5;
+        horiz_dielectric_constant[1][0] = 3.038;
+        vert_dielectric_constant[1][0] = 3.9;
+        fringe_cap = 0.115e-15;
+        wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0],
+          fringe_cap);
+
+        wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+        wire_width = wire_pitch[1][1] / 2;
+        aspect_ratio[1][1] = 2.0;
+        wire_thickness = aspect_ratio[1][1] * wire_width;
+        wire_spacing = wire_pitch[1][1] - wire_width;
+        wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+        ild_thickness[1][1] = 0.75;
+        miller_value[1][1] = 1.5;
+        horiz_dielectric_constant[1][1] = 3.038;
+        vert_dielectric_constant[1][1] = 3.9;
+        wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1],
+          fringe_cap);
+
+        wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+        aspect_ratio[1][2] = 2.2;
+        wire_width = wire_pitch[1][2] / 2;
+        wire_thickness = aspect_ratio[1][2] * wire_width;
+        wire_spacing = wire_pitch[1][2] - wire_width;
+        dishing_thickness = 0.1 *  wire_thickness;
+        wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+        ild_thickness[1][2]  = 1.98;
+        miller_value[1][2]  = 1.5;
+        horiz_dielectric_constant[1][2]  = 3.038;
+        vert_dielectric_constant[1][2]  = 3.9;
+        wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+        //Nominal projections for commodity DRAM wordline/bitline
+        wire_pitch[1][3] = 2 * 0.18;
+        wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.18);
+        wire_r_per_micron[1][3] = 12 / 0.18;
+    }
+    else if (tech == 90)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron
+      aspect_ratio[0][0] = 2.4;
+      wire_width = wire_pitch[0][0] / 2; //micron
+      wire_thickness = aspect_ratio[0][0] * wire_width;//micron
+      wire_spacing = wire_pitch[0][0] - wire_width;//micron
+      barrier_thickness = 0.01;//micron
+      dishing_thickness = 0;//micron
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron
+      ild_thickness[0][0] = 0.48;//micron
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 2.709;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15; //F/micron
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap);//F/micron.
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 2.4;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.48;//micron
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 2.709;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.7;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.96;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 2.709;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0]  = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.008;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0]  = 0.48;
+      miller_value[1][0]  = 1.5;
+      horiz_dielectric_constant[1][0]  = 3.038;
+      vert_dielectric_constant[1][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1]  = 0.48;
+      miller_value[1][1]  = 1.5;
+      horiz_dielectric_constant[1][1]  = 3.038;
+      vert_dielectric_constant[1][1]  = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2]  = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2]  = 1.1;
+      miller_value[1][2]  = 1.5;
+      horiz_dielectric_constant[1][2]  = 3.038;
+      vert_dielectric_constant[1][2]  = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.09;
+      wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09);
+      wire_r_per_micron[1][3] = 12 / 0.09;
+    }
+    else if (tech == 65)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 2.7;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.405;
+      miller_value[0][0]   = 1.5;
+      horiz_dielectric_constant[0][0]  = 2.303;
+      vert_dielectric_constant[0][0]   = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 2.7;
+      wire_thickness = aspect_ratio[0][1]  * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.405;
+      miller_value[0][1]   = 1.5;
+      horiz_dielectric_constant[0][1]  = 2.303;
+      vert_dielectric_constant[0][1]   = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.8;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.81;
+      miller_value[0][2]   = 1.5;
+      horiz_dielectric_constant[0][2]  = 2.303;
+      vert_dielectric_constant[0][2]   = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.006;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.405;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.734;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.405;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.734;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.77;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.734;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.065;
+      wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065);
+      wire_r_per_micron[1][3] = 12 / 0.065;
+    }
+    else if (tech == 45)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.315;
+      miller_value[0][0]  = 1.5;
+      horiz_dielectric_constant[0][0]  = 1.958;
+      vert_dielectric_constant[0][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.315;
+      miller_value[0][1]  = 1.5;
+      horiz_dielectric_constant[0][1]  = 1.958;
+      vert_dielectric_constant[0][1]  = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.63;
+      miller_value[0][2]  = 1.5;
+      horiz_dielectric_constant[0][2]  = 1.958;
+      vert_dielectric_constant[0][2]  = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.004;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.315;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.46;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.315;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.46;
+      vert_dielectric_constant[1][1] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.55;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.46;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.045;
+      wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045);
+      wire_r_per_micron[1][3] = 12 / 0.045;
+    }
+    else if (tech == 32)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.21;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.664;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.21;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.664;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.42;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.664;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.003;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.21;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.214;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      aspect_ratio[1][1] = 2.0;
+      wire_width = wire_pitch[1][1] / 2;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.21;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.214;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.385;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.214;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.032;//micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032);//F/micron
+      wire_r_per_micron[1][3] = 12 / 0.032;//ohm/micron
+    }
+    else if (tech == 22)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.15;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.414;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          wire_width = wire_pitch[0][1] / 2;
+          aspect_ratio[0][1] = 3.0;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.15;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.414;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+                          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.3;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.414;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+                          fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.003;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.15;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 2.104;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.15;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 2.104;
+          vert_dielectric_constant[1][1] = 3.9;
+          wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.275;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 2.104;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                        ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+                        fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.022;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.022;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+        }
+
+    else if (tech == 16)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.108;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.202;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          aspect_ratio[0][1] = 3.0;
+          wire_width = wire_pitch[0][1] / 2;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.108;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.202;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+                          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.216;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.202;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+                          fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.002;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.108;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 1.998;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.108;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 1.998;
+          vert_dielectric_constant[1][1] = 3.9;
+            wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.198;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 1.998;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                        ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+                        fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.016;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.016;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+        }
+    g_tp.wire_local.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.R_per_um += curr_alpha * wire_r_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.C_per_um += curr_alpha * wire_c_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+
+    g_tp.wire_inside_mat.pitch     += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.R_per_um  += curr_alpha* wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.C_per_um  += curr_alpha* wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+
+    g_tp.wire_outside_mat.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.R_per_um += curr_alpha*wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.C_per_um += curr_alpha*wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+
+    g_tp.unit_len_wire_del = g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2;
+
+    g_tp.sense_delay               += curr_alpha *SENSE_AMP_D;
+    g_tp.sense_dy_power            += curr_alpha *SENSE_AMP_P;
+//    g_tp.horiz_dielectric_constant += horiz_dielectric_constant;
+//    g_tp.vert_dielectric_constant  += vert_dielectric_constant;
+//    g_tp.aspect_ratio              += aspect_ratio;
+//    g_tp.miller_value              += miller_value;
+//    g_tp.ild_thickness             += ild_thickness;
+
+  }
+  g_tp.fringe_cap = fringe_cap;
+
+  double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio();
+  double c_load = gate_C(g_tp.min_w_nmos_ * (1 + p_to_n_sizing_r), 0.0);
+  double tf = rd * c_load;
+  g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE);
+  double KLOAD = 1;
+  c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+                    drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1, g_tp.cell_h_def) +
+                    gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0));
+  tf = rd * c_load;
+  g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE);
+}
+
diff --git a/ext/mcpat/cacti/uca.cc b/ext/mcpat/cacti/uca.cc
new file mode 100755 (executable)
index 0000000..568cd9e
--- /dev/null
@@ -0,0 +1,426 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+
+#include "uca.h"
+
+// Constructs the array described by dyn_p: builds the bank, derives the
+// per-bank signal widths from the port counts, allocates the H-tree
+// networks sized for the bank grid, and finally runs the delay and
+// power/energy computations.
+//
+// htree_in_search / htree_out_search are allocated only for fully-associative
+// or pure-CAM arrays; otherwise they are set to NULL so the pointers never
+// hold an indeterminate value.
+UCA::UCA(const DynamicParameter & dyn_p)
+ :dp(dyn_p), bank(dp), nbanks(g_ip->nbanks), refresh_power(0)
+{
+  // Split log2(nbanks) between the two directions: when the bank is taller
+  // than it is wide the vertical direction gets the floor of half the bits,
+  // otherwise it gets the remainder.
+  int num_banks_ver_dir = 1 << ((bank.area.h > bank.area.w) ? _log2(nbanks)/2 : (_log2(nbanks) - _log2(nbanks)/2));
+  int num_banks_hor_dir = nbanks/num_banks_ver_dir;
+
+  // Port counts come either from the dynamic parameters or from the global
+  // input parameters, depending on dp.use_inp_params.
+  if (dp.use_inp_params)
+  {
+    RWP  = dp.num_rw_ports;
+    ERP  = dp.num_rd_ports;
+    EWP  = dp.num_wr_ports;
+    SCHP = dp.num_search_ports;
+  }
+  else
+  {
+    RWP  = g_ip->num_rw_ports;
+    ERP  = g_ip->num_rd_ports;
+    EWP  = g_ip->num_wr_ports;
+    SCHP = g_ip->num_search_ports;
+  }
+
+  // Per-bank signal widths: each width is the per-port width times the
+  // number of ports that carry that kind of signal.
+  num_addr_b_bank = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
+  num_di_b_bank   = dp.num_di_b_bank_per_port * (RWP + EWP);
+  num_do_b_bank   = dp.num_do_b_bank_per_port * (RWP + ERP);
+  num_si_b_bank   = dp.num_si_b_bank_per_port * SCHP;
+  num_so_b_bank   = dp.num_so_b_bank_per_port * SCHP;
+
+  // Grid dimensions handed to every H-tree.
+  const int htree_ver = num_banks_ver_dir*2;
+  const int htree_hor = num_banks_hor_dir*2;
+
+  if (!dp.fully_assoc && !dp.pure_cam)
+  {
+    // With fast access on a non-tag array the data-output width is
+    // multiplied by the data associativity.
+    if (g_ip->fast_access && dp.is_tag == false)
+    {
+      num_do_b_bank *= g_ip->data_assoc;
+    }
+
+    // No search ports on this path: the search-in/search-out widths are 0.
+    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+                    htree_ver, htree_hor, Add_htree, true);
+    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+                    htree_ver, htree_hor, Data_in_htree, true);
+    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+                    htree_ver, htree_hor, Data_out_htree, true);
+
+    // No search networks exist for this kind of array.
+    htree_in_search  = NULL;
+    htree_out_search = NULL;
+  }
+  else
+  {
+    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+                    htree_ver, htree_hor, Add_htree, true);
+    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+                    htree_ver, htree_hor, Data_in_htree, true);
+    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+                    htree_ver, htree_hor, Data_out_htree, true);
+    // The search networks are built with the data-in / data-out H-tree types.
+    htree_in_search  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+                    htree_ver, htree_hor, Data_in_htree, true);
+    htree_out_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+                    num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+                    htree_ver, htree_hor, Data_out_htree, true);
+  }
+
+  // The overall footprint is taken from the data-in H-tree.
+  area.w = htree_in_data->area.w;
+  area.h = htree_in_data->area.h;
+
+  area_all_dataramcells = bank.mat.subarray.get_total_cell_area() * dp.num_subarrays * g_ip->nbanks;
+
+  // delay calculation
+  double inrisetime = 0.0;
+  compute_delays(inrisetime);
+  compute_power_energy();
+}
+
+
+
+// Releases the H-trees allocated by the constructor. The search H-trees are
+// only created for fully-associative or pure-CAM arrays, so they are only
+// released under that same condition.
+UCA::~UCA()
+{
+  delete htree_in_add;
+  delete htree_in_data;
+  delete htree_out_data;
+
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+    delete htree_in_search;
+    delete htree_out_search;
+  }
+}
+
+
+
+// Computes the timing figures of the array from the bank and H-tree delays.
+//
+// inrisetime is forwarded to bank.compute_delays(); the value that call
+// returns is returned unchanged.
+//
+// Members written: delay_array_to_sa_mux_lev_1_decoder,
+// delay_array_to_sa_mux_lev_2_decoder, delay_before_subarray_output_driver,
+// delay_from_subarray_out_drv_to_out, access_time, cycle_time,
+// multisubbank_interleave_cycle_time and precharge_delay.
+double UCA::compute_delays(double inrisetime)
+{
+  double outrisetime = bank.compute_delays(inrisetime);
+
+  // Request path: array-level H-tree followed by the bank-level H-tree.
+  double delay_array_to_mat = htree_in_add->delay + bank.htree_in_add->delay;
+  double max_delay_before_row_decoder = delay_array_to_mat + bank.mat.r_predec->delay;
+  delay_array_to_sa_mux_lev_1_decoder = delay_array_to_mat +
+    bank.mat.sa_mux_lev_1_predec->delay +
+    bank.mat.sa_mux_lev_1_dec->delay;
+  delay_array_to_sa_mux_lev_2_decoder = delay_array_to_mat +
+    bank.mat.sa_mux_lev_2_predec->delay +
+    bank.mat.sa_mux_lev_2_dec->delay;
+  double delay_inside_mat = bank.mat.row_dec->delay + bank.mat.delay_bitline + bank.mat.delay_sa;
+
+  // Slowest of the four paths that end at the subarray output driver.
+  delay_before_subarray_output_driver =
+    MAX(MAX(max_delay_before_row_decoder + delay_inside_mat,  // row_path
+            delay_array_to_mat + bank.mat.b_mux_predec->delay + bank.mat.bit_mux_dec->delay + bank.mat.delay_sa),  // col_path
+        MAX(delay_array_to_sa_mux_lev_1_decoder,    // sa_mux_lev_1_path
+            delay_array_to_sa_mux_lev_2_decoder));  // sa_mux_lev_2_path
+  delay_from_subarray_out_drv_to_out = bank.mat.delay_subarray_out_drv_htree +
+                                       bank.htree_out_data->delay + htree_out_data->delay;
+
+  if (dp.fully_assoc)
+  {
+    // delay of FA contains both CAM tag and RAM data
+    double ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
+    // delay of CAM
+    access_time = delay_array_to_mat;
+    // delay of fully-associative data array
+    access_time += ram_delay_inside_mat + delay_from_subarray_out_drv_to_out;
+  }
+  else
+  {
+    access_time = delay_before_subarray_output_driver + delay_from_subarray_out_drv_to_out; //data_acc_path
+  }
+
+  // For main memory the access time is replaced by the row path delay plus
+  // the slower sense-amp-mux path through to the output.
+  if (dp.is_main_mem)
+  {
+    double t_rcd       = max_delay_before_row_decoder + delay_inside_mat;
+    double cas_latency = MAX(delay_array_to_sa_mux_lev_1_decoder, delay_array_to_sa_mux_lev_2_decoder) +
+                         delay_from_subarray_out_drv_to_out;
+    access_time = t_rcd + cas_latency;
+  }
+
+  // Random cycle time: the in-mat delay plus the reset/restore delays,
+  // lower-bounded by each predecoder delay.
+  double random_cycle_time;
+
+  if (!dp.fully_assoc)
+  {
+    random_cycle_time = delay_inside_mat + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;//TODO: Sheng: revisit
+    if (dp.is_dram)
+    {
+      random_cycle_time += bank.mat.delay_writeback;
+    }
+
+    random_cycle_time = MAX(random_cycle_time, bank.mat.r_predec->delay);
+    random_cycle_time = MAX(random_cycle_time, bank.mat.b_mux_predec->delay);
+    random_cycle_time = MAX(random_cycle_time, bank.mat.sa_mux_lev_1_predec->delay);
+    random_cycle_time = MAX(random_cycle_time, bank.mat.sa_mux_lev_2_predec->delay);
+  }
+  else
+  {
+    double ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
+    random_cycle_time = ram_delay_inside_mat + bank.mat.delay_cam_sl_restore + bank.mat.delay_cam_ml_reset + bank.mat.delay_bl_restore
+                        + bank.mat.delay_hit_miss_reset + bank.mat.delay_wl_reset;
+
+    random_cycle_time = MAX(random_cycle_time, bank.mat.b_mux_predec->delay);//TODO: Sheng revisit whether distinguish cam and ram bitline etc.
+    random_cycle_time = MAX(random_cycle_time, bank.mat.sa_mux_lev_1_predec->delay);
+    random_cycle_time = MAX(random_cycle_time, bank.mat.sa_mux_lev_2_predec->delay);
+  }
+
+  // The following is true only if the input parameter "repeaters_in_htree" is set to false --Nav
+  if (g_ip->rpters_in_htree == false)
+  {
+    random_cycle_time = MAX(random_cycle_time, bank.htree_in_add->max_unpipelined_link_delay);
+  }
+  cycle_time = random_cycle_time;
+
+  double delay_req_network = max_delay_before_row_decoder;
+  double delay_rep_network = delay_from_subarray_out_drv_to_out;
+  multisubbank_interleave_cycle_time = MAX(delay_req_network, delay_rep_network);
+
+  if (dp.is_main_mem)
+  {
+    // Main memory overrides the interleave and cycle times and is the only
+    // case with a non-zero precharge delay.
+    multisubbank_interleave_cycle_time = htree_in_add->delay;
+    precharge_delay = htree_in_add->delay +
+                      bank.htree_in_add->delay + bank.mat.delay_writeback +
+                      bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;
+    cycle_time = access_time + precharge_delay;
+  }
+  else
+  {
+    precharge_delay = 0;
+  }
+
+  return outrisetime;
+}
+
+
+
+// Computes the energy and power figures of this UCA from its bank and from
+// the routing networks htree_in_add / htree_in_data / htree_out_data (plus
+// htree_in_search / htree_out_search when dp.fully_assoc or dp.pure_cam).
+//
+// Writes: power, power_routing_to_bank, dyn_read_energy_from_closed_page,
+// dyn_read_energy_from_open_page, dyn_read_energy_remaining_words_in_burst,
+// activate_energy, read_energy, write_energy, precharge_energy,
+// leak_power_subbank_closed_page, leak_power_subbank_open_page,
+// leak_power_request_and_reply_networks and, only when dp.is_dram,
+// refresh_power.
+//
+// Asserts on exit that read/write dynamic energy and read leakage are > 0.
+//
+// note: currently, power numbers are for a bank of an array
+void UCA::compute_power_energy()
+{
+  // The bank computes its own numbers first; they are the starting point
+  // for this object's `power`.
+  bank.compute_power_energy();
+  power = bank.power;
+
+  // Dynamic energy of the routing to the bank:
+  //   read  = address-in network + data-out network
+  //   write = address-in network + data-in network
+  power_routing_to_bank.readOp.dynamic  = htree_in_add->power.readOp.dynamic + htree_out_data->power.readOp.dynamic;
+  power_routing_to_bank.writeOp.dynamic = htree_in_add->power.readOp.dynamic + htree_in_data->power.readOp.dynamic;
+  // The search networks are only dereferenced for fully-associative / CAM arrays.
+  if (dp.fully_assoc || dp.pure_cam)
+      power_routing_to_bank.searchOp.dynamic= htree_in_search->power.searchOp.dynamic + htree_out_search->power.searchOp.dynamic;
+
+  // NOTE(review): the leakage fields are accumulated with += (the dynamic
+  // fields above use =). This presumably relies on power_routing_to_bank
+  // starting at zero; a second call would double-count -- confirm.
+  power_routing_to_bank.readOp.leakage += htree_in_add->power.readOp.leakage +
+                                          htree_in_data->power.readOp.leakage +
+                                          htree_out_data->power.readOp.leakage;
+
+  power_routing_to_bank.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage +
+                                          htree_in_data->power.readOp.gate_leakage +
+                                          htree_out_data->power.readOp.gate_leakage;
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+        power_routing_to_bank.readOp.leakage += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
+        power_routing_to_bank.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
+  }
+
+  // Fold the routing contribution into the totals copied from the bank.
+  power.searchOp.dynamic += power_routing_to_bank.searchOp.dynamic;
+  power.readOp.dynamic += power_routing_to_bank.readOp.dynamic;
+  power.readOp.leakage += power_routing_to_bank.readOp.leakage;
+  power.readOp.gate_leakage += power_routing_to_bank.readOp.gate_leakage;
+
+  // calculate total write energy per access:
+  // start from the read energy, then swap the bitline read term for the
+  // bitline write term, the routing read term for the routing write term,
+  // and the bank's data-out network for its data-in network.
+  // (For data arrays this value is overwritten further below.)
+  power.writeOp.dynamic = power.readOp.dynamic
+                        - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
+                        + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
+                        - power_routing_to_bank.readOp.dynamic
+                        + power_routing_to_bank.writeOp.dynamic
+                        + bank.htree_in_data->power.readOp.dynamic
+                        - bank.htree_out_data->power.readOp.dynamic;
+
+  // For non-DRAM arrays the sense-amplifier read energy is removed from the write energy.
+  if (dp.is_dram == false)
+  {
+    power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+  }
+
+  // Closed-page read = full read energy. Open-page read additionally
+  // subtracts row predecode, row decoders, bitline precharge/equalize
+  // drivers, sense amplifiers and bitlines of the active mats.
+  dyn_read_energy_from_closed_page = power.readOp.dynamic;
+  dyn_read_energy_from_open_page   = power.readOp.dynamic -
+                                     (bank.mat.r_predec->power.readOp.dynamic +
+                                      bank.mat.power_row_decoders.readOp.dynamic +
+                                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic +
+                                      bank.mat.power_sa.readOp.dynamic +
+                                      bank.mat.power_bitline.readOp.dynamic) * dp.num_act_mats_hor_dir;
+
+  // Energy of the remaining words of a burst: the column-select path, the
+  // bank data-out network and the routing read energy, repeated
+  // (max(burst_len / int_prefetch_w, 1) - 1) times.
+  // NOTE(review): if burst_len and int_prefetch_w are integers the division
+  // truncates -- confirm that is intended.
+  dyn_read_energy_remaining_words_in_burst =
+    (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1) *
+    ((bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
+      bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
+      bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+      bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
+      bank.mat.power_subarray_out_drv.readOp.dynamic)     * dp.num_act_mats_hor_dir +
+     bank.htree_out_data->power.readOp.dynamic +
+     power_routing_to_bank.readOp.dynamic);
+  dyn_read_energy_from_closed_page += dyn_read_energy_remaining_words_in_burst;
+  dyn_read_energy_from_open_page   += dyn_read_energy_remaining_words_in_burst;
+
+  // Per-command energies (activate / read / write / precharge).
+  activate_energy = htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_act +
+                    (bank.mat.r_predec->power.readOp.dynamic +
+                     bank.mat.power_row_decoders.readOp.dynamic +
+                     bank.mat.power_sa.readOp.dynamic) * dp.num_act_mats_hor_dir;
+  // NOTE(review): the last term below uses htree_in_data, whereas
+  // power_routing_to_bank.readOp.dynamic above uses htree_out_data for the
+  // read direction -- confirm which network is intended here.
+  read_energy    = (htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
+                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
+                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
+                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
+                     bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir +
+                    bank.htree_out_data->power.readOp.dynamic +
+                    htree_in_data->power.readOp.dynamic) * g_ip->burst_len;
+  write_energy   = (htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
+                    htree_in_data->power.readOp.dynamic +
+                    bank.htree_in_data->power.readOp.dynamic +
+                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
+                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
+                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic) * dp.num_act_mats_hor_dir) * g_ip->burst_len;
+  precharge_energy = (bank.mat.power_bitline.readOp.dynamic +
+                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic) * dp.num_act_mats_hor_dir;
+
+  // Closed-page sub-bank leakage: `leakage` of the predecoders and decoders
+  // plus the sense-amplifier leakage in the closed-page state ...
+  leak_power_subbank_closed_page =
+    (bank.mat.r_predec->power.readOp.leakage +
+     bank.mat.b_mux_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
+     bank.mat.power_row_decoders.readOp.leakage +
+     bank.mat.power_bit_mux_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
+     bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
+
+  // ... plus the `gate_leakage` of the same predecoders and decoders
+  // (the sense-amplifier term is not added a second time).
+  leak_power_subbank_closed_page +=
+    (bank.mat.r_predec->power.readOp.gate_leakage +
+     bank.mat.b_mux_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
+     bank.mat.power_row_decoders.readOp.gate_leakage +
+     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage) * dp.num_act_mats_hor_dir; //+
+     //bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
+
+  // Open-page sub-bank leakage: same structure, using the sense-amplifier
+  // leakage of the open-page state.
+  leak_power_subbank_open_page =
+    (bank.mat.r_predec->power.readOp.leakage +
+     bank.mat.b_mux_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
+     bank.mat.power_row_decoders.readOp.leakage +
+     bank.mat.power_bit_mux_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
+     bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
+
+  leak_power_subbank_open_page +=
+    (bank.mat.r_predec->power.readOp.gate_leakage +
+     bank.mat.b_mux_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
+     bank.mat.power_row_decoders.readOp.gate_leakage +
+     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage ) * dp.num_act_mats_hor_dir;
+     //bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
+
+  // Leakage of the request/reply networks: routing to the bank plus the
+  // bank's own address, data-in and data-out networks (`leakage` first,
+  // then `gate_leakage`).
+  leak_power_request_and_reply_networks =
+    power_routing_to_bank.readOp.leakage +
+    bank.htree_in_add->power.readOp.leakage +
+    bank.htree_in_data->power.readOp.leakage +
+    bank.htree_out_data->power.readOp.leakage;
+
+  leak_power_request_and_reply_networks +=
+    power_routing_to_bank.readOp.gate_leakage +
+    bank.htree_in_add->power.readOp.gate_leakage +
+    bank.htree_in_data->power.readOp.gate_leakage +
+    bank.htree_out_data->power.readOp.gate_leakage;
+
+  // NOTE(review): power_routing_to_bank.readOp.leakage / gate_leakage already
+  // received the search-network terms above, and they are added again here
+  // -- confirm this is not a double count.
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+        leak_power_request_and_reply_networks += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
+        leak_power_request_and_reply_networks += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
+  }
+
+
+  if (dp.is_dram)
+  { // if DRAM, add contribution of power spent in row predecoder drivers, blocks and decoders to refresh power
+    refresh_power  = (bank.mat.r_predec->power.readOp.dynamic * dp.num_act_mats_hor_dir +
+                      bank.mat.row_dec->power.readOp.dynamic) * dp.num_r_subarray * dp.num_subarrays;
+    refresh_power += bank.mat.per_bitline_read_energy * dp.num_c_subarray * dp.num_r_subarray * dp.num_subarrays;
+    refresh_power += bank.mat.power_bl_precharge_eq_drv.readOp.dynamic * dp.num_act_mats_hor_dir;
+    refresh_power += bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+    // The accumulated value is divided by the refresh period.
+    refresh_power /= dp.dram_refresh_period;
+  }
+
+
+  // For data arrays (dp.is_tag == false) the read energy becomes the
+  // closed-page value and the write energy computed earlier is replaced.
+  if (dp.is_tag == false)
+  {
+    power.readOp.dynamic  = dyn_read_energy_from_closed_page;
+    power.writeOp.dynamic = dyn_read_energy_from_closed_page
+      - dyn_read_energy_remaining_words_in_burst
+      - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
+      + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
+      + (power_routing_to_bank.writeOp.dynamic -
+         power_routing_to_bank.readOp.dynamic -
+         bank.htree_out_data->power.readOp.dynamic +
+         bank.htree_in_data->power.readOp.dynamic) *
+        (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1); //FIXME
+
+    if (dp.is_dram == false)
+    {
+      power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+    }
+  }
+
+  // if DRAM, add refresh power to total leakage
+  if (dp.is_dram)
+  {
+    power.readOp.leakage += refresh_power;
+  }
+
+  // TODO: below should be  avoided.
+  /*if (dp.is_main_mem)
+  {
+    power.readOp.leakage += MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA * 1e-3 * g_tp.peri_global.Vdd / g_ip->nbanks;
+  }*/
+
+  // Sanity checks: a configuration yielding non-positive energy/leakage aborts here.
+  assert(power.readOp.dynamic  > 0);
+  assert(power.writeOp.dynamic > 0);
+  assert(power.readOp.leakage  > 0);
+}
+
diff --git a/ext/mcpat/cacti/uca.h b/ext/mcpat/cacti/uca.h
new file mode 100755 (executable)
index 0000000..fdab14f
--- /dev/null
@@ -0,0 +1,95 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __UCA_H__
+#define __UCA_H__
+
+#include "area.h"
+#include "bank.h"
+#include "component.h"
+#include "htree2.h"
+#include "parameter.h"
+
+// UCA: a Component built from one Bank plus the Htree2 networks declared
+// below. Its delay, energy and power results are exposed as public members.
+//
+// NOTE(review): the Htree2 members are raw pointers and no copy constructor
+// or assignment operator is declared here, so copying a UCA would alias
+// them. Ownership is presumably handled by ~UCA() -- confirm in uca.cc.
+class UCA : public Component
+{
+  public:
+    UCA(const DynamicParameter & dyn_p);
+    ~UCA();
+    double compute_delays(double inrisetime);  // returns outrisetime
+    // Fills `power`, power_routing_to_bank and the energy / leakage /
+    // refresh members declared below.
+    void   compute_power_energy();
+
+    DynamicParameter dp;
+    Bank   bank;
+
+    // Routing networks. compute_power_energy() dereferences the two search
+    // networks only when dp.fully_assoc or dp.pure_cam is set.
+    Htree2   * htree_in_add;
+    Htree2   * htree_in_data;
+    Htree2   * htree_out_data;
+    Htree2   * htree_in_search;
+    Htree2   * htree_out_search;
+
+    // Dynamic energy and leakage of the networks above
+    // (written by compute_power_energy()).
+    powerDef power_routing_to_bank;
+
+    uint32_t nbanks;
+
+    int   num_addr_b_bank;
+    int   num_di_b_bank;
+    int   num_do_b_bank;
+    int   num_si_b_bank;
+    int   num_so_b_bank;
+    int   RWP, ERP, EWP,SCHP;
+    double area_all_dataramcells;
+
+    // Read energies written by compute_power_energy(); both page variants
+    // include dyn_read_energy_remaining_words_in_burst.
+    double dyn_read_energy_from_closed_page;
+    double dyn_read_energy_from_open_page;
+    double dyn_read_energy_remaining_words_in_burst;
+
+    double refresh_power;  // only for DRAM (assigned only when dp.is_dram)
+    double activate_energy;
+    double read_energy;
+    double write_energy;
+    double precharge_energy;
+    double leak_power_subbank_closed_page;
+    double leak_power_subbank_open_page;
+    double leak_power_request_and_reply_networks;
+
+    // Timing results. compute_delays() assigns access_time, precharge_delay
+    // (0 unless dp.is_main_mem) and multisubbank_interleave_cycle_time.
+    double delay_array_to_sa_mux_lev_1_decoder;
+    double delay_array_to_sa_mux_lev_2_decoder;
+    double delay_before_subarray_output_driver;
+    double delay_from_subarray_out_drv_to_out;
+    double access_time;
+    double precharge_delay;
+    double multisubbank_interleave_cycle_time;
+};
+
+#endif
+
diff --git a/ext/mcpat/cacti/wire.cc b/ext/mcpat/cacti/wire.cc
new file mode 100644 (file)
index 0000000..742000c
--- /dev/null
@@ -0,0 +1,832 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#include "wire.h"
+#include "cmath"
+// use this constructor to calculate wire stats
+Wire::Wire(
+    enum Wire_type wire_model,
+    double wl,
+    int n,
+    double w_s,
+    double s_s,
+    enum Wire_placement wp,
+    double resistivity,
+    TechnologyParameter::DeviceType *dt
+    ):wt(wire_model), wire_length(wl*1e-6), nsense(n), w_scale(w_s), s_scale(s_s),
+    resistivity(resistivity), deviceType(dt)
+{
+  wire_placement = wp;
+  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+  in_rise_time   = 0;
+  out_rise_time  = 0;
+  if (initialized != 1) {
+    cout << "Wire not initialized. Initializing it with default values\n";
+    Wire winit;
+  }
+  calculate_wire_stats();
+  // change everything back to seconds, microns, and Joules
+  repeater_spacing *= 1e6;
+  wire_length      *= 1e6;
+  wire_width       *= 1e6;
+  wire_spacing     *= 1e6;
+  assert(wire_length > 0);
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+}
+
+    // Definitions of Wire's static data members.
+    //
+    // the following values are for peripheral global technology
+    // specified in the input config file
+    //
+    // calculate_wire_stats() multiplies the delay and power of the matching
+    // Component by wire_length and reads its area.w / area.h as repeater
+    // spacing / repeater size. They are presumably filled by init_wire()
+    // -- confirm, the definition is not in this part of the file.
+    Component Wire::global;
+    Component Wire::global_5;
+    Component Wire::global_10;
+    Component Wire::global_20;
+    Component Wire::global_30;
+    Component Wire::low_swing;
+
+    // Set to 1 by the (w_s, s_s, wp, resis, dt) constructor; the stats
+    // constructor checks it before calling calculate_wire_stats().
+    int Wire::initialized;
+    // Copies of wire_width / wire_spacing taken by that same constructor
+    // right after init_wire().
+    double Wire::wire_width_init;
+    double Wire::wire_spacing_init;
+
+
+Wire::Wire(double w_s, double s_s, enum Wire_placement wp, double resis, TechnologyParameter::DeviceType *dt)
+{
+  w_scale        = w_s;
+  s_scale        = s_s;
+  deviceType     = dt;
+  wire_placement = wp;
+  resistivity    = resis;
+  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;
+  in_rise_time   = 0;
+  out_rise_time  = 0;
+
+  switch (wire_placement)
+  {
+    case outside_mat: wire_width = g_tp.wire_outside_mat.pitch; break;
+    case inside_mat : wire_width = g_tp.wire_inside_mat.pitch;  break;
+    default:          wire_width = g_tp.wire_local.pitch; break;
+  }
+
+  wire_spacing = wire_width;
+
+  wire_width   *= (w_scale * 1e-6/2) /* (m) */;
+  wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
+
+  initialized = 1;
+  init_wire();
+  wire_width_init = wire_width;
+  wire_spacing_init = wire_spacing;
+
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+}
+
+
+
+// The destructor body is empty; no cleanup is performed here.
+Wire::~Wire() {}
+
+
+
+void
+Wire::calculate_wire_stats()
+{
+
+  if (wire_placement == outside_mat) {
+    wire_width = g_tp.wire_outside_mat.pitch;
+  }
+  else if (wire_placement == inside_mat) {
+    wire_width = g_tp.wire_inside_mat.pitch;
+  }
+  else {
+    wire_width = g_tp.wire_local.pitch;
+  }
+
+  wire_spacing = wire_width;
+
+  wire_width   *= (w_scale * 1e-6/2) /* (m) */;
+  wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
+
+
+  if (wt != Low_swing) {
+
+          //    delay_optimal_wire();
+
+          if (wt == Global) {
+                  delay = global.delay * wire_length;
+                  power.readOp.dynamic = global.power.readOp.dynamic * wire_length;
+                  power.readOp.leakage = global.power.readOp.leakage * wire_length;
+                  power.readOp.gate_leakage = global.power.readOp.gate_leakage * wire_length;
+                  repeater_spacing = global.area.w;
+                  repeater_size = global.area.h;
+                  area.set_area((wire_length/repeater_spacing) *
+                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+          }
+          else if (wt == Global_5) {
+                  delay = global_5.delay * wire_length;
+                  power.readOp.dynamic = global_5.power.readOp.dynamic * wire_length;
+                  power.readOp.leakage = global_5.power.readOp.leakage * wire_length;
+                  power.readOp.gate_leakage = global_5.power.readOp.gate_leakage * wire_length;
+                  repeater_spacing = global_5.area.w;
+                  repeater_size = global_5.area.h;
+                  area.set_area((wire_length/repeater_spacing) *
+                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+          }
+          else if (wt == Global_10) {
+                  delay = global_10.delay * wire_length;
+                  power.readOp.dynamic = global_10.power.readOp.dynamic * wire_length;
+                  power.readOp.leakage = global_10.power.readOp.leakage * wire_length;
+                  power.readOp.gate_leakage = global_10.power.readOp.gate_leakage * wire_length;
+                  repeater_spacing = global_10.area.w;
+                  repeater_size = global_10.area.h;
+                  area.set_area((wire_length/repeater_spacing) *
+                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+          }
+          else if (wt == Global_20) {
+                  delay = global_20.delay * wire_length;
+                  power.readOp.dynamic = global_20.power.readOp.dynamic * wire_length;
+                  power.readOp.leakage = global_20.power.readOp.leakage * wire_length;
+                  power.readOp.gate_leakage = global_20.power.readOp.gate_leakage * wire_length;
+                  repeater_spacing = global_20.area.w;
+                  repeater_size = global_20.area.h;
+                  area.set_area((wire_length/repeater_spacing) *
+                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+          }
+          else if (wt == Global_30) {
+                  delay = global_30.delay * wire_length;
+                  power.readOp.dynamic = global_30.power.readOp.dynamic * wire_length;
+                  power.readOp.leakage = global_30.power.readOp.leakage * wire_length;
+                  power.readOp.gate_leakage = global_30.power.readOp.gate_leakage * wire_length;
+                  repeater_spacing = global_30.area.w;
+                  repeater_size = global_30.area.h;
+                  area.set_area((wire_length/repeater_spacing) *
+                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+          }
+    out_rise_time = delay*repeater_spacing/deviceType->Vth;
+  }
+  else if (wt == Low_swing) {
+    low_swing_model ();
+    repeater_spacing = wire_length;
+    repeater_size = 1;
+  }
+  else {
+    assert(0);
+  }
+}
+
+
+
+/*
+ * The fall time of an input signal to the first stage of a circuit is
+ * assumed to be same as the fall time of the output signal of two
+ * inverters connected in series (refer: CACTI 1 Technical report,
+ * section 6.1.3)
+ */
+  double
+Wire::signal_fall_time ()
+{
+
+  /* rise time of inverter 1's output */
+  double rt;
+  /* fall time of inverter 2's output */
+  double ft;
+  double timeconst;
+
+  timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
+    tr_R_on(min_w_pmos, PCH, 1);
+  rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth);
+  timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
+    tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth;
+  return ft;
+}
+
+
+
+double Wire::signal_rise_time ()
+{
+
+  /* rise time of inverter 1's output */
+  double ft;
+  /* fall time of inverter 2's output */
+  double rt;
+  double timeconst;
+
+  timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
+    tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth;
+  timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
+    tr_R_on(min_w_pmos, PCH, 1);
+  ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth);
+  return ft; //sec
+}
+
+
+
+/* Wire resistance and capacitance calculations
+ *   wire width
+ *
+ *    /__/
+ *   |  |
+ *   |  |  height = ASPECT_RATIO*wire width (ASPECT_RATIO = 2.2, ref: ITRS)
+ *   |__|/
+ *
+ *   spacing between wires in the same level = wire width
+ *   spacing between wires in adjacent levels = wire width --- this is
+ *   incorrect according to R. Ho's paper and thesis: ILD != wire width
+ *
+ */
+
+double Wire::wire_cap (double len /* in m */, bool call_from_outside)
+{
+        //TODO: this should be consistent with the wire_res in technology file
+  double sidewall, adj, tot_cap;
+  double wire_height;
+  double epsilon0 = 8.8542e-12;
+  double aspect_ratio, horiz_dielectric_constant, vert_dielectric_constant, miller_value,ild_thickness;
+
+  switch (wire_placement)
+  {
+    case outside_mat:
+        {
+                aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
+                horiz_dielectric_constant = g_tp.wire_outside_mat.horiz_dielectric_constant;
+                vert_dielectric_constant = g_tp.wire_outside_mat.vert_dielectric_constant;
+                miller_value = g_tp.wire_outside_mat.miller_value;
+                ild_thickness = g_tp.wire_outside_mat.ild_thickness;
+                break;
+        }
+    case inside_mat :
+        {
+                aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
+                horiz_dielectric_constant = g_tp.wire_inside_mat.horiz_dielectric_constant;
+                vert_dielectric_constant = g_tp.wire_inside_mat.vert_dielectric_constant;
+                miller_value = g_tp.wire_inside_mat.miller_value;
+                ild_thickness = g_tp.wire_inside_mat.ild_thickness;
+                break;
+        }
+    default:
+        {
+                aspect_ratio = g_tp.wire_local.aspect_ratio;
+                horiz_dielectric_constant = g_tp.wire_local.horiz_dielectric_constant;
+                vert_dielectric_constant = g_tp.wire_local.vert_dielectric_constant;
+                miller_value = g_tp.wire_local.miller_value;
+                ild_thickness = g_tp.wire_local.ild_thickness;
+                break;
+        }
+  }
+
+  if (call_from_outside)
+  {
+          wire_width       *= 1e-6;
+          wire_spacing     *= 1e-6;
+  }
+  wire_height = wire_width/w_scale*aspect_ratio;
+  /*
+   * assuming height does not change. wire_width = width_original*w_scale
+   * So wire_height does not change as wire width increases
+   */
+
+// capacitance between wires in the same level
+//  sidewall = 2*miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
+//    * epsilon0;
+
+  sidewall = miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
+    * epsilon0;
+
+
+  // capacitance between wires in adjacent levels
+  //adj = miller_value * vert_dielectric_constant *w_scale * epsilon0;
+  //adj = 2*vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
+
+  adj = miller_value *vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
+  //Change ild_thickness from micron to M
+
+  //tot_cap =  (sidewall + adj + (deviceType->C_fringe * 1e6)); //F/m
+  tot_cap =  (sidewall + adj + (g_tp.fringe_cap * 1e6)); //F/m
+
+  if (call_from_outside)
+  {
+          wire_width       *= 1e6;
+          wire_spacing     *= 1e6;
+  }
+  return (tot_cap*len); // (F)
+}
+
+
+  double
+Wire::wire_res (double len /*(in m)*/)
+{
+
+          double aspect_ratio,alpha_scatter =1.05, dishing_thickness=0, barrier_thickness=0;
+          //TODO: this should be consistent with the wire_res in technology file
+          //The whole computation should be consistent with the wire_res in technology.cc too!
+
+          switch (wire_placement)
+          {
+          case outside_mat:
+          {
+                  aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
+                  break;
+          }
+          case inside_mat :
+          {
+                  aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
+                  break;
+          }
+          default:
+          {
+                  aspect_ratio = g_tp.wire_local.aspect_ratio;
+                  break;
+          }
+          }
+          return (alpha_scatter * resistivity * 1e-6 * len/((aspect_ratio*wire_width/w_scale-dishing_thickness - barrier_thickness)*
+                          (wire_width-2*barrier_thickness)));
+}
+
+/*
+ * Calculates the delay, power and area of the transmitter circuit.
+ *
+ * The transmitter delay is the sum of the nand gate delay, the inverter
+ * delay, the low swing nmos delay, and the wire delay
+ * (ref: Technical report 6)
+ */
+  void
+Wire::low_swing_model()
+{
+  double len = wire_length;
+  double beta = pmos_to_nmos_sz_ratio();
+
+
+  double inputrise = (in_rise_time == 0) ? signal_rise_time() : in_rise_time;
+
+  /* Final nmos low swing driver size calculation:
+   * Try to size the driver such that the delay
+   * is less than 8FO4.
+   * If the driver size is greater than
+   * the max allowable size, assume max size for the driver.
+   * In either case, recalculate the delay using
+   * the final driver size assuming slow input with
+   * finite rise time instead of ideal step input
+   *
+   * (ref: Technical report 6)
+   */
+  double cwire = wire_cap(len); /* load capacitance */
+  double rwire = wire_res(len);
+
+#define RES_ADJ (8.6) // Increase in resistance due to low driving vol.
+
+  double driver_res = (-8*g_tp.FO4/(log(0.5) * cwire))/RES_ADJ;
+  double nsize = R_to_w(driver_res, NCH);
+
+  nsize = MIN(nsize, g_tp.max_w_nmos_);
+  nsize = MAX(nsize, g_tp.min_w_nmos_);
+
+  if(rwire*cwire > 8*g_tp.FO4)
+  {
+    nsize = g_tp.max_w_nmos_;
+  }
+
+  // size the inverter appropriately to minimize the transmitter delay
+  // Note - In order to minimize leakage, we are not adding a set of inverters to
+  // bring down delay. Instead, we are sizing the single gate
+  // based on the logical effort.
+  double st_eff   = sqrt((2+beta/1+beta)*gate_C(nsize, 0)/(gate_C(2*g_tp.min_w_nmos_, 0)
+        + gate_C(2*min_w_pmos, 0)));
+  double req_cin  = ((2+beta/1+beta)*gate_C(nsize, 0))/st_eff;
+  double inv_size = req_cin/(gate_C(min_w_pmos, 0) + gate_C(g_tp.min_w_nmos_, 0));
+  inv_size = MAX(inv_size, 1);
+
+  /* nand gate delay */
+  double res_eq = (2 * tr_R_on(g_tp.min_w_nmos_, NCH, 1));
+  double cap_eq = 2 * drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(inv_size*g_tp.min_w_nmos_, 0) +
+    gate_C(inv_size*min_w_pmos, 0);
+
+  double timeconst = res_eq * cap_eq;
+
+  delay = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+      deviceType->Vth/deviceType->Vdd, RISE);
+  double temp_power = cap_eq*deviceType->Vdd*deviceType->Vdd;
+
+  inputrise = delay / (deviceType->Vdd - deviceType->Vth); /* for the next stage */
+
+  /* Inverter delay:
+   * The load capacitance of this inv depends on
+   * the gate capacitance of the final stage nmos
+   * transistor which in turn depends on nsize
+   */
+  res_eq = tr_R_on(inv_size*min_w_pmos, PCH, 1);
+  cap_eq = drain_C_(inv_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(inv_size*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(nsize, 0);
+  timeconst = res_eq * cap_eq;
+
+  delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+      deviceType->Vth/deviceType->Vdd, FALL);
+  temp_power += cap_eq*deviceType->Vdd*deviceType->Vdd;
+
+
+  transmitter.delay = delay;
+  transmitter.power.readOp.dynamic = temp_power*2; /* since it is a diff. model*/
+  transmitter.power.readOp.leakage = deviceType->Vdd *
+    (4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
+     4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
+
+  transmitter.power.readOp.gate_leakage = deviceType->Vdd *
+    (4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
+     4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
+
+  inputrise = delay / deviceType->Vth;
+
+  /* nmos delay + wire delay */
+  cap_eq = cwire + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2 +
+    nsense * sense_amp_input_cap(); //+receiver cap
+  /*
+   * NOTE: nmos is used as both pull up and pull down transistor
+   * in the transmitter. This is because for low voltage swing, drive
+   * resistance of nmos is less than pmos
+   * (for a detailed graph ref: On-Chip Wires: Scaling and Efficiency)
+   */
+  timeconst = (tr_R_on(nsize, NCH, 1)*RES_ADJ) * (cwire +
+      drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2) +
+    rwire*cwire/2 +
+    (tr_R_on(nsize, NCH, 1)*RES_ADJ + rwire) *
+    nsense * sense_amp_input_cap();
+
+  /*
+   * since we are pre-equalizing and overdriving the low
+   * swing wires, the net time constant is less
+   * than the actual value
+   */
+  delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, .25, 0);
+#define VOL_SWING .1
+  temp_power += cap_eq*VOL_SWING*.400; /* .4v is the over drive voltage */
+  temp_power *= 2; /* differential wire */
+
+  l_wire.delay = delay - transmitter.delay;
+  l_wire.power.readOp.dynamic = temp_power - transmitter.power.readOp.dynamic;
+  l_wire.power.readOp.leakage = deviceType->Vdd*
+    (4* cmos_Isub_leakage(nsize, 0, 1, nmos));
+
+  l_wire.power.readOp.gate_leakage = deviceType->Vdd*
+    (4* cmos_Ig_leakage(nsize, 0, 1, nmos));
+
+  //double rt = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+  //    deviceType->Vth/deviceType->Vdd, RISE)/deviceType->Vth;
+
+  delay += g_tp.sense_delay;
+
+  sense_amp.delay = g_tp.sense_delay;
+  out_rise_time = g_tp.sense_delay/(deviceType->Vth);
+  sense_amp.power.readOp.dynamic = g_tp.sense_dy_power;
+  sense_amp.power.readOp.leakage = 0; //FIXME
+  sense_amp.power.readOp.gate_leakage = 0;
+
+  power.readOp.dynamic = temp_power + sense_amp.power.readOp.dynamic;
+  power.readOp.leakage = transmitter.power.readOp.leakage +
+                         l_wire.power.readOp.leakage +
+                         sense_amp.power.readOp.leakage;
+  power.readOp.gate_leakage = transmitter.power.readOp.gate_leakage +
+                         l_wire.power.readOp.gate_leakage +
+                         sense_amp.power.readOp.gate_leakage;
+}
+
+  double
+Wire::sense_amp_input_cap()
+{
+  // Capacitance seen at one sense-amp input: the drain of the w_iso PMOS,
+  // the gate load of (w_sense_en + w_sense_n), and the drains of the
+  // w_sense_n NMOS and w_sense_p PMOS devices.
+  double iso_drain_cap = drain_C_(g_tp.w_iso, PCH, 1, 1, g_tp.cell_h_def);
+  double gate_load_cap = gate_C(g_tp.w_sense_en + g_tp.w_sense_n, 0);
+  double n_drain_cap   = drain_C_(g_tp.w_sense_n, NCH, 1, 1, g_tp.cell_h_def);
+  double p_drain_cap   = drain_C_(g_tp.w_sense_p, PCH, 1, 1, g_tp.cell_h_def);
+
+  return iso_drain_cap + gate_load_cap + n_drain_cap + p_drain_cap;
+}
+
+
+void Wire::delay_optimal_wire ()
+{
+  double len       = wire_length;
+  //double min_wire_width = wire_width; //m
+  double beta = pmos_to_nmos_sz_ratio();
+  double switching = 0;  // switching energy
+  double short_ckt = 0;  // short-circuit energy
+  double tc        = 0;  // time constant
+  // input cap of min sized driver
+  double input_cap = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0);
+
+   // output parasitic capacitance of
+   // the min. sized driver
+  double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
+  // drive resistance
+  double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
+      tr_R_on(min_w_pmos, PCH, 1))/2;
+  double wr = wire_res(len); //ohm
+
+  // wire cap /m
+  double wc = wire_cap(len);
+
+  // size the repeater such that the delay of the wire is minimum
+  double repeater_scaling = sqrt(out_res*wc/(wr*input_cap)); // len will cancel
+
+   // calc the optimum spacing between the repeaters (m)
+
+  repeater_spacing = sqrt(2 * out_res * (out_cap + input_cap)/
+      ((wr/len)*(wc/len)));
+  repeater_size = repeater_scaling;
+
+  switching = (repeater_scaling * (input_cap + out_cap) +
+      repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
+
+  tc = out_res * (input_cap + out_cap) +
+    out_res * wc/len * repeater_spacing/repeater_scaling +
+    wr/len * repeater_spacing * input_cap * repeater_scaling +
+    0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
+
+  delay = 0.693 * tc * len/repeater_spacing;
+
+#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
+  short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
+    repeater_scaling * tc;
+
+  area.set_area((len/repeater_spacing) *
+                compute_gate_area(INV, 1, min_w_pmos * repeater_scaling,
+                                          g_tp.min_w_nmos_ * repeater_scaling, g_tp.cell_h_def));
+  power.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
+  power.readOp.leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
+  power.readOp.gate_leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
+}
+
+
+
+// calculate power/delay values for wires with suboptimal repeater sizing/spacing
+void
+Wire::init_wire(){
+  wire_length = 1;
+  delay_optimal_wire();
+    double sp, si;
+  powerDef pow;
+  si = repeater_size;
+  sp = repeater_spacing;
+  sp *= 1e6; // in microns
+
+  double i, j, del;
+  repeated_wire.push_back(Component());
+  for (j=sp; j < 4*sp; j+=100) {
+    for (i = si; i > 1; i--) {
+      pow = wire_model(j*1e-6, i, &del);
+      if (j == sp && i == si) {
+        global.delay = del;
+        global.power = pow;
+        global.area.h = si;
+        global.area.w = sp*1e-6; // m
+      }
+//      cout << "Repeater size - "<< i <<
+//        " Repeater spacing - " << j <<
+//        " Delay - " << del <<
+//        " PowerD - " << pow.readOp.dynamic <<
+//        " PowerL - " << pow.readOp.leakage <<endl;
+      repeated_wire.back().delay = del;
+      repeated_wire.back().power.readOp = pow.readOp;
+      repeated_wire.back().area.w = j*1e-6; //m
+      repeated_wire.back().area.h = i;
+      repeated_wire.push_back(Component());
+
+    }
+  }
+  repeated_wire.pop_back();
+  update_fullswing();
+  Wire *l_wire = new Wire(Low_swing, 0.001/* 1 mm*/, 1);
+  low_swing.delay = l_wire->delay;
+  low_swing.power = l_wire->power;
+  delete l_wire;
+}
+
+
+
+void Wire::update_fullswing()
+{
+
+  list<Component>::iterator citer;
+  double del[4];
+  del[3] = this->global.delay + this->global.delay*.3;
+  del[2] = global.delay + global.delay*.2;
+  del[1] = global.delay + global.delay*.1;
+  del[0] = global.delay + global.delay*.05;
+  double threshold;
+  double ncost;
+  double cost;
+  int i = 4;
+  while (i>0) {
+    threshold = del[i-1];
+    cost = BIGNUM;
+    for (citer = repeated_wire.begin(); citer != repeated_wire.end(); citer++)
+    {
+      if (citer->delay > threshold) {
+        citer = repeated_wire.erase(citer);
+        citer --;
+      }
+      else {
+        ncost = citer->power.readOp.dynamic/global.power.readOp.dynamic +
+                citer->power.readOp.leakage/global.power.readOp.leakage;
+        if(ncost < cost)
+        {
+          cost = ncost;
+          if (i == 4) {
+            global_30.delay = citer->delay;
+            global_30.power = citer->power;
+            global_30.area  = citer->area;
+          }
+          else if (i==3) {
+            global_20.delay = citer->delay;
+            global_20.power = citer->power;
+            global_20.area  = citer->area;
+          }
+          else if(i==2) {
+            global_10.delay = citer->delay;
+            global_10.power = citer->power;
+            global_10.area  = citer->area;
+          }
+          else if(i==1) {
+            global_5.delay = citer->delay;
+            global_5.power = citer->power;
+            global_5.area  = citer->area;
+          }
+        }
+      }
+    }
+    i--;
+  }
+}
+
+
+
+powerDef Wire::wire_model (double space, double size, double *delay)
+{
+  powerDef ptemp;
+  double len = 1;
+  //double min_wire_width = wire_width; //m
+  double beta = pmos_to_nmos_sz_ratio();
+  // switching energy
+  double switching = 0;
+  // short-circuit energy
+  double short_ckt = 0;
+  // time constant
+  double tc = 0;
+  // input cap of min sized driver
+  double input_cap = gate_C (g_tp.min_w_nmos_ +
+      min_w_pmos, 0);
+
+   // output parasitic capacitance of
+   // the min. sized driver
+  double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
+  // drive resistance
+  double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
+      tr_R_on(min_w_pmos, PCH, 1))/2;
+  double wr = wire_res(len); //ohm
+
+  // wire cap /m
+  double wc = wire_cap(len);
+
+  repeater_spacing = space;
+  repeater_size = size;
+
+  switching = (repeater_size * (input_cap + out_cap) +
+      repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
+
+  tc = out_res * (input_cap + out_cap) +
+    out_res * wc/len * repeater_spacing/repeater_size +
+    wr/len * repeater_spacing * out_cap * repeater_size +
+    0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
+
+  *delay = 0.693 * tc * len/repeater_spacing;
+
+#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
+  short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
+    repeater_size * tc;
+
+  ptemp.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
+  ptemp.readOp.leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
+
+  ptemp.readOp.gate_leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
+
+  return ptemp;
+}
+
+void
+Wire::print_wire()
+{
+
+  cout << "\nWire Properties:\n\n";
+  cout << "  Delay Optimal\n\tRepeater size - "<< global.area.h <<
+    " \n\tRepeater spacing - " << global.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global.delay*1e6 <<  " (ns/mm)"
+    " \n\tPowerD - " << global.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+
+  cout << "  5% Overhead\n\tRepeater size - "<< global_5.area.h <<
+    " \n\tRepeater spacing - " << global_5.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_5.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_5.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_5.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_5.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  10% Overhead\n\tRepeater size - "<< global_10.area.h <<
+    " \n\tRepeater spacing - " << global_10.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_10.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_10.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_10.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_10.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  20% Overhead\n\tRepeater size - "<< global_20.area.h <<
+    " \n\tRepeater spacing - " << global_20.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_20.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_20.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_20.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_20.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  30% Overhead\n\tRepeater size - "<< global_30.area.h <<
+    " \n\tRepeater spacing - " << global_30.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_30.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_30.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_30.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_30.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  Low-swing wire (1 mm) - Note: Unlike repeated wires, \n\tdelay and power "
+            "values of low-swing wires do not\n\thave a linear relationship with length." <<
+      " \n\tdelay - " << low_swing.delay *1e9<<  " (ns)"
+      " \n\tpowerD - " << low_swing.power.readOp.dynamic *1e9<< " (nJ)"
+      " \n\tPowerL - " << low_swing.power.readOp.leakage << " (mW)"
+      " \n\tPowerLgate - " << low_swing.power.readOp.gate_leakage << " (mW)\n";
+  cout << "\tWire width - " <<wire_width_init * 2 /* differential */<< " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init * 2 /* differential */<< " microns\n";
+  cout <<endl;
+  cout <<endl;
+
+}
+
diff --git a/ext/mcpat/cacti/wire.h b/ext/mcpat/cacti/wire.h
new file mode 100644 (file)
index 0000000..51d55af
--- /dev/null
@@ -0,0 +1,124 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __WIRE_H__
+#define __WIRE_H__
+
+#include <iostream>
+#include <list>
+
+#include "assert.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+
+// Delay / power / area model of an on-chip wire. A Wire is itself a
+// Component; the static Component members below hold reference design
+// points shared by all instances (filled in by init_wire()).
+class Wire : public Component
+{
+  public:
+    // Models a single wire of the given type and length.
+    Wire(enum Wire_type wire_model, double len /* NOTE(review): was "in u", but init_wire() passes 0.001 for 1 mm, i.e. meters */,
+         int nsense = 1/* no. of sense amps connected to the low-swing wire */,
+         double width_scaling = 1,
+         double spacing_scaling = 1,
+         enum Wire_placement wire_placement = outside_mat,
+         double resistivity = CU_RESISTIVITY,
+         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Wire();
+
+    Wire( double width_scaling = 1,
+         double spacing_scaling = 1,
+         enum Wire_placement wire_placement = outside_mat,
+         double resistivity = CU_RESISTIVITY,
+         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
+    ); // should be used only once for initializing static members
+    // Sweeps repeater size/spacing and fills the static global*,
+    // and low_swing components.
+    void init_wire();
+
+    void calculate_wire_stats();
+    // Computes the delay-optimal repeater size/spacing and the resulting
+    // delay, area and power for wire_length.
+    void delay_optimal_wire();
+    double wire_cap(double len, bool call_from_outside=false);
+    double wire_res(double len); // callers treat the result as ohms
+    // Fills transmitter, l_wire, sense_amp and the total delay/power.
+    void low_swing_model();
+    double signal_fall_time();
+    double signal_rise_time();
+    // Input capacitance of one sense amplifier on a low-swing wire.
+    double sense_amp_input_cap();
+
+    enum Wire_type wt;
+    double wire_spacing;
+    double wire_width;
+    enum Wire_placement wire_placement;
+    double repeater_size;    // relative to a minimum-sized driver
+    double repeater_spacing; // distance between repeaters (m)
+    double wire_length;
+    // in_rise_time == 0 makes low_swing_model() use signal_rise_time()
+    double in_rise_time, out_rise_time;
+
+    void set_in_rise_time(double rt)
+    {
+      in_rise_time = rt;
+    }
+    // Reference design points: delay-optimal repeated wire, lowest-cost
+    // points within 5/10/20/30% delay overhead, and a 1 mm low-swing wire.
+    static Component global;
+    static Component global_5;
+    static Component global_10;
+    static Component global_20;
+    static Component global_30;
+    static Component low_swing;
+    static double wire_width_init;
+    static double wire_spacing_init;
+    // Prints the reference design points above to stdout.
+    void print_wire();
+
+  private:
+
+    int nsense; // no. of sense amps connected to a low-swing wire if it
+                // is broadcasting data to multiple destinations
+    // width and spacing scaling factor can be used
+    // to model low level wires or special
+    // fat wires
+    double w_scale, s_scale;
+    double resistivity;
+    // Evaluates a unit-length repeated wire for the given spacing/size;
+    // overwrites repeater_spacing and repeater_size.
+    powerDef wire_model (double space, double size, double *delay);
+    // Candidate design points built by init_wire() and pruned by
+    // update_fullswing().
+    list <Component> repeated_wire;
+    void update_fullswing();
+    // presumably guards the one-time static initialization — TODO confirm
+    static int initialized;
+
+
+    //low-swing
+    Component transmitter;
+    Component l_wire;
+    Component sense_amp;
+
+    // presumably the minimum pmos width matched to g_tp.min_w_nmos_ — TODO confirm
+    double min_w_pmos;
+
+    TechnologyParameter::DeviceType *deviceType;
+
+};
+
+#endif
diff --git a/ext/mcpat/core.cc b/ext/mcpat/core.cc
new file mode 100644 (file)
index 0000000..ba91060
--- /dev/null
@@ -0,0 +1,4135 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "const.h"
+#include "core.h"
+#include "io.h"
+#include "parameter.h"
+//#include "globalvar.h"
+
+InstFetchU::InstFetchU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ IB  (0),
+ BTB (0),
+ ID_inst  (0),
+ ID_operand  (0),
+ ID_misc  (0),
+ exist(exist_)
+{
+          if (!exist) return;
+          int  idx, tag, data, size, line, assoc, banks;
+          bool debug= false, is_default = true;
+
+          clockRate = coredynp.clockRate;
+          executionTime = coredynp.executionTime;
+          cache_p = (Cache_policy)XML->sys.core[ithCore].icache.icache_config[7];
+          //Assuming all L1 caches are virtually idxed physically tagged.
+          //cache
+
+          size                             = (int)XML->sys.core[ithCore].icache.icache_config[0];
+          line                             = (int)XML->sys.core[ithCore].icache.icache_config[1];
+          assoc                            = (int)XML->sys.core[ithCore].icache.icache_config[2];
+          banks                            = (int)XML->sys.core[ithCore].icache.icache_config[3];
+          idx                                                     = debug?9:int(ceil(log2(size/line/assoc)));
+          tag                                                     = debug?51:(int)XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.cache_sz            = debug?32768:(int)XML->sys.core[ithCore].icache.icache_config[0];
+          interface_ip.line_sz             = debug?64:(int)XML->sys.core[ithCore].icache.icache_config[1];
+          interface_ip.assoc               = debug?8:(int)XML->sys.core[ithCore].icache.icache_config[2];
+          interface_ip.nbanks              = debug?1:(int)XML->sys.core[ithCore].icache.icache_config[3];
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
+          interface_ip.latency             = debug?3.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
+          interface_ip.is_cache                         = true;
+          interface_ip.pure_cam                         = false;
+          interface_ip.pure_ram                         = false;
+        //  interface_ip.obj_func_dyn_energy = 0;
+        //  interface_ip.obj_func_dyn_power  = 0;
+        //  interface_ip.obj_func_leak_power = 0;
+        //  interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          icache.caches = new ArrayST(&interface_ip, "icache", Core_device, coredynp.opt_local, coredynp.core_ty);
+          scktRatio = g_tp.sckt_co_eff;
+          chip_PR_overhead = g_tp.chip_layout_overhead;
+          macro_PR_overhead = g_tp.macro_layout_overhead;
+          icache.area.set_area(icache.area.get_area()+ icache.caches->local_result.area);
+          area.set_area(area.get_area()+ icache.caches->local_result.area);
+          //output_data_csv(icache.caches.local_result);
+
+
+          /*
+           *iCache controllers
+           *miss buffer Each MSHR contains enough state
+           *to handle one or more accesses of any type to a single memory line.
+           *Due to the generality of the MSHR mechanism,
+           *the amount of state involved is non-trivial:
+           *including the address, pointers to the cache entry and destination register,
+           *written data, and various other pieces of state.
+           */
+          interface_ip.num_search_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + icache.caches->l_ip.line_sz*8;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].icache.buffer_sizes[0]*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;//means cycle time
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;//means access time
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
+          icache.missb = new ArrayST(&interface_ip, "icacheMissBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          icache.area.set_area(icache.area.get_area()+ icache.missb->local_result.area);
+          area.set_area(area.get_area()+ icache.missb->local_result.area);
+          //output_data_csv(icache.missb.local_result);
+
+          //fill buffer
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = icache.caches->l_ip.line_sz;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = data*XML->sys.core[ithCore].icache.buffer_sizes[1];
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
+          icache.ifb = new ArrayST(&interface_ip, "icacheFillBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          icache.area.set_area(icache.area.get_area()+ icache.ifb->local_result.area);
+          area.set_area(area.get_area()+ icache.ifb->local_result.area);
+          //output_data_csv(icache.ifb.local_result);
+
+          //prefetch buffer
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge.
+          data                                                    = icache.caches->l_ip.line_sz;//separate queue to prevent from cache polution.
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].icache.buffer_sizes[2]*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
+          icache.prefetchb = new ArrayST(&interface_ip, "icacheprefetchBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          icache.area.set_area(icache.area.get_area()+ icache.prefetchb->local_result.area);
+          area.set_area(area.get_area()+ icache.prefetchb->local_result.area);
+          //output_data_csv(icache.prefetchb.local_result);
+
+          //Instruction buffer
+          data                                                    = XML->sys.core[ithCore].instruction_length*XML->sys.core[ithCore].peak_issue_width;//icache.caches.l_ip.line_sz; //multiple threads timing sharing the instruction buffer.
+          interface_ip.is_cache                           = false;
+          interface_ip.pure_ram            = true;
+          interface_ip.pure_cam            = false;
+          interface_ip.line_sz             = int(ceil(data/8.0));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].number_hardware_threads*XML->sys.core[ithCore].instruction_buffer_size*interface_ip.line_sz>64?
+                                                     XML->sys.core[ithCore].number_hardware_threads*XML->sys.core[ithCore].instruction_buffer_size*interface_ip.line_sz:64;
+          interface_ip.assoc               = 1;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = 1.0/clockRate;
+          interface_ip.latency             = 1.0/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          //NOTE: Assuming IB is time slice shared among threads, every fetch op will at least fetch "fetch width" instructions.
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;//XML->sys.core[ithCore].fetch_width;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          IB = new ArrayST(&interface_ip, "InstBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          IB->area.set_area(IB->area.get_area()+ IB->local_result.area);
+          area.set_area(area.get_area()+ IB->local_result.area);
+          //output_data_csv(IB.IB.local_result);
+
+          //     inst_decoder.opcode_length = XML->sys.core[ithCore].opcode_width;
+          //     inst_decoder.init_decoder(is_default, &interface_ip);
+          //     inst_decoder.full_decoder_power();
+
+      if (coredynp.predictionW>0)
+      {
+          /*
+           * BTB branch target buffer, accessed during IF stage. Virtually indexed and virtually tagged
+           * It is only a cache without all the buffers in the cache controller since it is more like a
+           * look up table than a cache with cache controller. When access miss, no load from other places
+           * such as main memory (not actively fill the misses), it is passively updated under two circumstances:
+           * 1)  when BPT@ID stage finds out current is a taken branch while BTB missed
+           * 2)  When BPT@ID stage predicts differently than BTB
+           * 3)  When ID stage finds out current instruction is not a branch while BTB had a hit.(mark as invalid)
+           * 4)  when EXEU find out wrong target has been provided from BTB.
+           *
+           */
+          size                             = XML->sys.core[ithCore].BTB.BTB_config[0];
+          line                             = XML->sys.core[ithCore].BTB.BTB_config[1];
+          assoc                            = XML->sys.core[ithCore].BTB.BTB_config[2];
+          banks                            = XML->sys.core[ithCore].BTB.BTB_config[3];
+          idx                                                     = debug?9:int(ceil(log2(size/line/assoc)));
+//       tag                                                      = debug?51:XML->sys.virtual_address_width-idx-int(ceil(log2(line))) + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +EXTRA_TAG_BITS;
+          tag                                                     = debug?51:XML->sys.virtual_address_width + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +EXTRA_TAG_BITS;
+          interface_ip.is_cache                           = true;
+          interface_ip.pure_ram            = false;
+          interface_ip.pure_cam            = false;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.cache_sz            = debug?32768:size;
+          interface_ip.line_sz             = debug?64:line;
+          interface_ip.assoc               = debug?8:assoc;
+          interface_ip.nbanks              = debug?1:banks;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;//debug?0:XML->sys.core[ithCore].dcache.dcache_config[5];
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].BTB.BTB_config[4]/clockRate;
+          interface_ip.latency             = debug?3.0/clockRate:XML->sys.core[ithCore].BTB.BTB_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = coredynp.predictionW;
+          interface_ip.num_wr_ports    = coredynp.predictionW;
+          interface_ip.num_se_rd_ports = 0;
+          BTB = new ArrayST(&interface_ip, "Branch Target Buffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          BTB->area.set_area(BTB->area.get_area()+ BTB->local_result.area);
+          area.set_area(area.get_area()+ BTB->local_result.area);
+          ///cout<<"area="<<area<<endl;
+
+          BPT = new BranchPredictor(XML, ithCore, &interface_ip,coredynp);
+          area.set_area(area.get_area()+ BPT->area.get_area());
+      }
+
+      ID_inst = new inst_decoder(is_default, &interface_ip,
+                  coredynp.opcode_length, 1/*Decoder should not know how many by itself*/,
+                  coredynp.x86,
+                  Core_device, coredynp.core_ty);
+
+      ID_operand = new inst_decoder(is_default, &interface_ip,
+                  coredynp.arch_ireg_width, 1,
+                  coredynp.x86,
+                  Core_device, coredynp.core_ty);
+
+      ID_misc = new inst_decoder(is_default, &interface_ip,
+                  8/* Prefix field etc upto 14B*/, 1,
+                  coredynp.x86,
+                  Core_device, coredynp.core_ty);
+      //TODO: X86 decoder should decode the inst in cyclic mode under the control of squencer.
+      //So the dynamic power should be multiplied by a few times.
+      area.set_area(area.get_area()+ (ID_inst->area.get_area()
+                  +ID_operand->area.get_area()
+                  +ID_misc->area.get_area())*coredynp.decodeW);
+
+}
+
+
+BranchPredictor::BranchPredictor(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ globalBPT(0),
+ localBPT(0),
+ L1_localBPT(0),
+ L2_localBPT(0),
+ chooser(0),
+ RAS(0),
+ exist(exist_)
+{
+        /*
+         * Branch Predictor, accessed during ID stage.
+         * McPAT's branch predictor model is the tournament branch predictor used in Alpha 21264,
+         * including global predictor, local two level predictor, and Chooser.
+         * The Branch predictor also includes a RAS (return address stack) for function calls
+         * Branch predictors are tagged by thread ID and modeled as 1-way associative $
+         * However RAS return address stacks are duplicated for each thread.
+         * TODO:Data Width need to be computed more precisely   *
+         */
+        if (!exist) return;
+        int  tag, data;
+
+        clockRate = coredynp.clockRate;
+        executionTime = coredynp.executionTime;
+        interface_ip.assoc               = 1;
+        interface_ip.pure_cam            = false;
+        if (coredynp.multithreaded)
+        {
+
+                tag                                                         = int(log2(coredynp.num_hthreads)+ EXTRA_TAG_BITS);
+                interface_ip.specific_tag        = 1;
+                interface_ip.tag_w               = tag;
+
+                interface_ip.is_cache                   = true;
+                interface_ip.pure_ram            = false;
+                }
+        else
+        {
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_ram            = true;
+
+        }
+        //Global predictor
+        data                                                    = int(ceil(XML->sys.core[ithCore].predictor.global_predictor_bits/8.0));
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].predictor.global_predictor_entries;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 2;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 0;
+        interface_ip.num_rd_ports    = coredynp.predictionW;
+        interface_ip.num_wr_ports    = coredynp.predictionW;
+        interface_ip.num_se_rd_ports = 0;
+        globalBPT = new ArrayST(&interface_ip, "Global Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
+        globalBPT->area.set_area(globalBPT->area.get_area()+ globalBPT->local_result.area);
+        area.set_area(area.get_area()+ globalBPT->local_result.area);
+
+        //Local BPT (Level 1)
+        data                                                    = int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[0]/8.0));
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].predictor.local_predictor_entries;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 2;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 0;
+        interface_ip.num_rd_ports    = coredynp.predictionW;
+        interface_ip.num_wr_ports    = coredynp.predictionW;
+        interface_ip.num_se_rd_ports = 0;
+        L1_localBPT = new ArrayST(&interface_ip, "L1 local Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
+        L1_localBPT->area.set_area(L1_localBPT->area.get_area()+ L1_localBPT->local_result.area);
+        area.set_area(area.get_area()+ L1_localBPT->local_result.area);
+
+        //Local BPT (Level 2)
+        data                                                    = int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[1]/8.0));
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].predictor.local_predictor_entries;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 2;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 0;
+        interface_ip.num_rd_ports    = coredynp.predictionW;
+        interface_ip.num_wr_ports    = coredynp.predictionW;
+        interface_ip.num_se_rd_ports = 0;
+        L2_localBPT = new ArrayST(&interface_ip, "L2 local Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
+        L2_localBPT->area.set_area(L2_localBPT->area.get_area()+ L2_localBPT->local_result.area);
+        area.set_area(area.get_area()+ L2_localBPT->local_result.area);
+
+        //Chooser
+        data                                                    = int(ceil(XML->sys.core[ithCore].predictor.chooser_predictor_bits/8.0));
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].predictor.chooser_predictor_entries;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 2;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 0;
+        interface_ip.num_rd_ports    = coredynp.predictionW;
+        interface_ip.num_wr_ports    = coredynp.predictionW;
+        interface_ip.num_se_rd_ports = 0;
+        chooser = new ArrayST(&interface_ip, "Predictor Chooser", Core_device, coredynp.opt_local, coredynp.core_ty);
+        chooser->area.set_area(chooser->area.get_area()+ chooser->local_result.area);
+        area.set_area(area.get_area()+ chooser->local_result.area);
+
+        //RAS return address stacks are Duplicated for each thread.
+        interface_ip.is_cache                   = false;
+        interface_ip.pure_ram            = true;
+        data                                                    = int(ceil(coredynp.pc_width/8.0));
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].RAS_size;
+        interface_ip.assoc               = 1;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 2;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 0;
+        interface_ip.num_rd_ports    = coredynp.predictionW;
+        interface_ip.num_wr_ports    = coredynp.predictionW;
+        interface_ip.num_se_rd_ports = 0;
+        RAS = new ArrayST(&interface_ip, "RAS", Core_device, coredynp.opt_local, coredynp.core_ty);
+        RAS->area.set_area(RAS->area.get_area()+ RAS->local_result.area*coredynp.num_hthreads);
+        area.set_area(area.get_area()+ RAS->local_result.area*coredynp.num_hthreads);
+
+}
+
+SchedulerU::SchedulerU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ int_inst_window(0),
+ fp_inst_window(0),
+ ROB(0),
+ instruction_selection(0),
+ exist(exist_)
+ {
+        if (!exist) return;
+        int   tag, data;
+        bool  is_default=true;
+        string tmp_name;
+
+        clockRate = coredynp.clockRate;
+        executionTime = coredynp.executionTime;
+        if ((coredynp.core_ty==Inorder && coredynp.multithreaded))
+        {
+                //Instruction issue queue, in-order multi-issue or multithreaded processor also has this structure. Unified window for Inorder processors
+                tag                                                         = int(log2(XML->sys.core[ithCore].number_hardware_threads)*coredynp.perThreadState);//This is the normal thread state bits based on Niagara Design
+                data                                                    = XML->sys.core[ithCore].instruction_length;
+                //NOTE: x86 inst can be very lengthy, up to 15B. Source: Intel® 64 and IA-32 Architectures
+                //Software Developer’s Manual
+                interface_ip.is_cache                   = true;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = false;
+                interface_ip.line_sz             = int(ceil(data/8.0));
+                interface_ip.specific_tag        = 1;
+                interface_ip.tag_w               = tag;
+                interface_ip.cache_sz            = XML->sys.core[ithCore].instruction_window_size*interface_ip.line_sz>64?XML->sys.core[ithCore].instruction_window_size*interface_ip.line_sz:64;
+                interface_ip.assoc               = 0;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports        = 0;
+                interface_ip.num_rd_ports        = coredynp.peak_issueW;
+                interface_ip.num_wr_ports        = coredynp.peak_issueW;
+                interface_ip.num_se_rd_ports     = 0;
+                interface_ip.num_search_ports    = coredynp.peak_issueW;
+                int_inst_window = new ArrayST(&interface_ip, "InstFetchQueue", Core_device, coredynp.opt_local, coredynp.core_ty);
+                int_inst_window->area.set_area(int_inst_window->area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
+                area.set_area(area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
+                //output_data_csv(iRS.RS.local_result);
+                Iw_height      =int_inst_window->local_result.cache_ht;
+
+                /*
+                 * selection logic
+                 * In a single-issue Inorder multithreaded processor like Niagara, issue width=1*number_of_threads since the processor does need to pick up
+                 * instructions from multiple ready ones(although these ready ones are from different threads).While SMT processors do not distinguish which thread belongs to who
+                 * at the issue stage.
+                 */
+
+                instruction_selection = new selection_logic(is_default, XML->sys.core[ithCore].instruction_window_size,
+                                coredynp.peak_issueW*XML->sys.core[ithCore].number_hardware_threads,
+                                &interface_ip, Core_device, coredynp.core_ty);
+        }
+
+    if (coredynp.core_ty==OOO)
+    {
+        /*
+         * CAM based instruction window
+         * For physicalRegFilebased OOO it is the instruction issue queue, where only tags of phy regs are stored
+         * For RS based OOO it is the Reservation station, where both tags and values of phy regs are stored
+         * It is written once and read twice(two operands) before an instruction can be issued.
+         * X86 instruction can be very long up to 15B. add instruction length in XML
+         */
+        if(coredynp.scheu_ty==PhysicalRegFile)
+        {
+                tag     = coredynp.phy_ireg_width;
+                // Each time only half of the tag is compared, but two tag should be stored.
+                // This underestimate the search power
+                data = int((ceil((coredynp.instruction_length+2*(coredynp.phy_ireg_width - coredynp.arch_ireg_width))/2.0)/8.0));
+                //Data width being divided by 2 means only after both operands available the whole data will be read out.
+                //This is modeled using two equivalent readouts with half of the data width
+                tmp_name = "InstIssueQueue";
+        }
+        else
+        {
+                tag      = coredynp.phy_ireg_width;
+                // Each time only half of the tag is compared, but two tag should be stored.
+                // This underestimate the search power
+                data  = int(ceil(((coredynp.instruction_length+2*(coredynp.phy_ireg_width - coredynp.arch_ireg_width)+
+                                2*coredynp.int_data_width)/2.0)/8.0));
+                //Data width being divided by 2 means only after both operands available the whole data will be read out.
+                //This is modeled using two equivalent readouts with half of the data width
+
+                tmp_name = "IntReservationStation";
+        }
+        interface_ip.is_cache                   = true;
+        interface_ip.pure_cam            = false;
+        interface_ip.pure_ram            = false;
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].instruction_window_size;
+        interface_ip.assoc               = 0;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.specific_tag        = 1;
+        interface_ip.tag_w               = tag;
+        interface_ip.access_mode         = 0;
+        interface_ip.throughput          = 2*1.0/clockRate;
+        interface_ip.latency             = 2*1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports       = 0;
+        interface_ip.num_rd_ports       = coredynp.peak_issueW;
+        interface_ip.num_wr_ports       = coredynp.peak_issueW;
+        interface_ip.num_se_rd_ports    = 0;
+                interface_ip.num_search_ports   = coredynp.peak_issueW;
+                int_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device, coredynp.opt_local, coredynp.core_ty);
+                int_inst_window->area.set_area(int_inst_window->area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
+                area.set_area(area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
+                Iw_height      =int_inst_window->local_result.cache_ht;
+                //FU inst window
+        if(coredynp.scheu_ty==PhysicalRegFile)
+        {
+                tag     = 2*coredynp.phy_freg_width;// TODO: each time only half of the tag is compared
+                data = int(ceil((coredynp.instruction_length+2*(coredynp.phy_freg_width - coredynp.arch_freg_width))/8.0));
+                tmp_name = "FPIssueQueue";
+        }
+        else
+        {
+                tag      = 2*coredynp.phy_ireg_width;
+                data  = int(ceil((coredynp.instruction_length+2*(coredynp.phy_freg_width - coredynp.arch_freg_width)+
+                                2*coredynp.fp_data_width)/8.0));
+                tmp_name = "FPReservationStation";
+        }
+        interface_ip.is_cache                   = true;
+        interface_ip.pure_cam            = false;
+        interface_ip.pure_ram            = false;
+        interface_ip.line_sz             = data;
+        interface_ip.cache_sz            = data*XML->sys.core[ithCore].fp_instruction_window_size;
+        interface_ip.assoc               = 0;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.specific_tag        = 1;
+        interface_ip.tag_w               = tag;
+        interface_ip.access_mode         = 0;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports       = 0;
+        interface_ip.num_rd_ports       = coredynp.fp_issueW;
+        interface_ip.num_wr_ports       = coredynp.fp_issueW;
+        interface_ip.num_se_rd_ports    = 0;
+                interface_ip.num_search_ports   = coredynp.fp_issueW;
+                fp_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device, coredynp.opt_local, coredynp.core_ty);
+                fp_inst_window->area.set_area(fp_inst_window->area.get_area()+ fp_inst_window->local_result.area*coredynp.num_fp_pipelines);
+                area.set_area(area.get_area()+ fp_inst_window->local_result.area*coredynp.num_fp_pipelines);
+                fp_Iw_height      =fp_inst_window->local_result.cache_ht;
+
+                if (XML->sys.core[ithCore].ROB_size >0)
+                {
+                        /*
+                         *  if ROB_size = 0, then the target processor does not support hardware-based
+                         *  speculation, i.e. , the processor allow OOO issue as well as OOO completion, which
+                         *  means branch must be resolved before instruction issued into instruction window, since
+                         *  there is no change to flush miss-predict branch path after instructions are issued in this situation.
+                         *
+                         *  ROB.ROB size = inflight inst. ROB is unified for int and fp inst.
+                         *  One old approach is to combine the RAT and ROB as a huge CAM structure as in AMD K7.
+                         *  However, this approach is abandoned due to its high power and poor scalablility.
+                         *     McPAT uses current implementation of ROB as circular buffer.
+                         *     ROB is written once when instruction is issued and read once when the instruction is committed.         *
+                         */
+                        int robExtra = int(ceil(5 + log2(coredynp.num_hthreads)));
+                        //5 bits are: busy, Issued, Finished, speculative, valid
+                        if(coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                //PC is to id the instruction for recover exception.
+                                //inst is used to map the renamed dest. registers.so that commit stage can know which reg/RRAT to update
+//                             data = int(ceil((robExtra+coredynp.pc_width +
+//                                             coredynp.instruction_length + 2*coredynp.phy_ireg_width)/8.0));
+                                data = int(ceil((robExtra+coredynp.pc_width +
+                                                        coredynp.phy_ireg_width)/8.0));
+                        }
+                        else
+                        {
+                                //in RS based OOO, ROB also contains value of destination reg
+//                             data  = int(ceil((robExtra+coredynp.pc_width +
+//                                             coredynp.instruction_length + 2*coredynp.phy_ireg_width + coredynp.fp_data_width)/8.0));
+                                data  = int(ceil((robExtra + coredynp.pc_width +
+                                                coredynp.phy_ireg_width + coredynp.fp_data_width)/8.0));
+                        }
+                        interface_ip.is_cache                   = false;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = true;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].ROB_size;//The XML ROB size is for all threads
+                        interface_ip.assoc               = 1;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = interface_ip.line_sz*8;
+                        interface_ip.access_mode         = 1;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports       = 0;
+                        interface_ip.num_rd_ports       = coredynp.peak_commitW;
+                        interface_ip.num_wr_ports       = coredynp.peak_issueW;
+                        interface_ip.num_se_rd_ports    = 0;
+                        interface_ip.num_search_ports   = 0;
+                        ROB = new ArrayST(&interface_ip, "ReorderBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        ROB->area.set_area(ROB->area.get_area()+ ROB->local_result.area*coredynp.num_pipelines);
+                        area.set_area(area.get_area()+ ROB->local_result.area*coredynp.num_pipelines);
+                        ROB_height      =ROB->local_result.cache_ht;
+                }
+
+                instruction_selection = new selection_logic(is_default, XML->sys.core[ithCore].instruction_window_size,
+                                coredynp.peak_issueW, &interface_ip, Core_device, coredynp.core_ty);
+    }
+}
+
+LoadStoreU::LoadStoreU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) // Builds the load/store unit model for core ithCore_: D-cache array, its miss/fill/prefetch/write-back buffers and the load/store queue(s), accumulating their areas.
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_), // presumably copies the caller's InputParameter (member declaration not visible here) - TODO confirm
+ coredynp(dyn_p_),
+ LSQ(0), // NOTE(review): LoadQ is not initialized in this list; it is only assigned in the OOO branch at the end of this constructor
+ exist(exist_)
+{
+          if (!exist) return; // nothing is built for a unit flagged as non-existent
+          int  idx, tag, data, size, line, assoc, banks;
+          bool debug= false; // always false here, so every debug?a:b below takes the XML-driven branch
+          int ldst_opcode = XML->sys.core[ithCore].opcode_width;//16;
+
+          clockRate = coredynp.clockRate;
+          executionTime = coredynp.executionTime;
+          cache_p = (Cache_policy)XML->sys.core[ithCore].dcache.dcache_config[7]; // dcache_config[7] is interpreted as the cache write policy
+
+          interface_ip.num_search_ports    = XML->sys.core[ithCore].memory_ports;
+          interface_ip.is_cache                           = true;
+          interface_ip.pure_cam            = false;
+          interface_ip.pure_ram            = false;
+          //Dcache: main data array; geometry comes from dcache_config[0..3] (size, line, assoc, banks)
+          size                             = (int)XML->sys.core[ithCore].dcache.dcache_config[0];
+          line                             = (int)XML->sys.core[ithCore].dcache.dcache_config[1];
+          assoc                            = (int)XML->sys.core[ithCore].dcache.dcache_config[2];
+          banks                            = (int)XML->sys.core[ithCore].dcache.dcache_config[3]; // NOTE(review): banks is never read again in this constructor
+          idx                                                     = debug?9:int(ceil(log2(size/line/assoc))); // index bits = ceil(log2(number of sets)); size/line/assoc is integer division
+          tag                                                     = debug?51:XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS; // physical address minus index and line-offset bits, plus EXTRA_TAG_BITS
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.cache_sz            = debug?32768:(int)XML->sys.core[ithCore].dcache.dcache_config[0];
+          interface_ip.line_sz             = debug?64:(int)XML->sys.core[ithCore].dcache.dcache_config[1];
+          interface_ip.assoc               = debug?8:(int)XML->sys.core[ithCore].dcache.dcache_config[2];
+          interface_ip.nbanks              = debug?1:(int)XML->sys.core[ithCore].dcache.dcache_config[3];
+          interface_ip.out_w               = interface_ip.line_sz*8; // output width in bits = one full line
+          interface_ip.access_mode         = 0;//debug?0:XML->sys.core[ithCore].dcache.dcache_config[5];
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; // dcache_config[4] divided by clockRate
+          interface_ip.latency             = debug?3.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; // dcache_config[5] divided by clockRate
+          interface_ip.is_cache                         = true; // already set above; repeated here
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].memory_ports;//usually In-order has 1 and OOO has 2 at least.
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          dcache.caches = new ArrayST(&interface_ip, "dcache", Core_device, coredynp.opt_local, coredynp.core_ty); // interface_ip is reused and overwritten for every array below, so statement order matters
+          dcache.area.set_area(dcache.area.get_area()+ dcache.caches->local_result.area);
+          area.set_area(area.get_area()+ dcache.caches->local_result.area); // the unit-level area accumulates every array built here
+          //output_data_csv(dcache.caches.local_result);
+
+          //dCache controllers
+          //miss buffer: each entry holds a physical address, a log2(size/line)-bit field and one full D-cache line
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + dcache.caches->l_ip.line_sz*8; // entry payload in bits
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].dcache.buffer_sizes[0]*interface_ip.line_sz; // buffer_sizes[0] entries
+          interface_ip.assoc               = 0; // NOTE(review): assoc 0 presumably selects a fully-associative array in the array model - confirm
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 2;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate;
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].memory_ports;;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          dcache.missb = new ArrayST(&interface_ip, "dcacheMissBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          dcache.area.set_area(dcache.area.get_area()+ dcache.missb->local_result.area);
+          area.set_area(area.get_area()+ dcache.missb->local_result.area);
+          //output_data_csv(dcache.missb.local_result);
+
+          //fill buffer: buffer_sizes[1] entries, each one D-cache line wide
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = dcache.caches->l_ip.line_sz; // read back from the D-cache array because interface_ip.line_sz was overwritten for the miss buffer
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = data*XML->sys.core[ithCore].dcache.buffer_sizes[1];
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 2;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate;
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].memory_ports;;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          dcache.ifb = new ArrayST(&interface_ip, "dcacheFillBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          dcache.area.set_area(dcache.area.get_area()+ dcache.ifb->local_result.area);
+          area.set_area(area.get_area()+ dcache.ifb->local_result.area);
+          //output_data_csv(dcache.ifb.local_result);
+
+          //prefetch buffer: buffer_sizes[2] entries, each one D-cache line wide
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide whether to merge.
+          data                                                    = dcache.caches->l_ip.line_sz;//separate queue to prevent cache pollution.
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].dcache.buffer_sizes[2]*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 2;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate;
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = debug?1:XML->sys.core[ithCore].memory_ports;;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          dcache.prefetchb = new ArrayST(&interface_ip, "dcacheprefetchBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
+          dcache.area.set_area(dcache.area.get_area()+ dcache.prefetchb->local_result.area);
+          area.set_area(area.get_area()+ dcache.prefetchb->local_result.area);
+          //output_data_csv(dcache.prefetchb.local_result);
+
+          //WBB (write-back buffer): only built when the cache policy is Write_back; buffer_sizes[3] entries, each one D-cache line wide
+
+          if (cache_p==Write_back)
+          {
+                  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+                  data                                                    = dcache.caches->l_ip.line_sz;
+                  interface_ip.specific_tag        = 1;
+                  interface_ip.tag_w               = tag;
+                  interface_ip.line_sz             = data;
+                  interface_ip.cache_sz            = XML->sys.core[ithCore].dcache.buffer_sizes[3]*interface_ip.line_sz;
+                  interface_ip.assoc               = 0;
+                  interface_ip.nbanks              = 1;
+                  interface_ip.out_w               = interface_ip.line_sz*8;
+                  interface_ip.access_mode         = 2;
+                  interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate;
+                  interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate;
+                  interface_ip.obj_func_dyn_energy = 0;
+                  interface_ip.obj_func_dyn_power  = 0;
+                  interface_ip.obj_func_leak_power = 0;
+                  interface_ip.obj_func_cycle_t    = 1;
+                  interface_ip.num_rw_ports    = XML->sys.core[ithCore].memory_ports;
+                  interface_ip.num_rd_ports    = 0;
+                  interface_ip.num_wr_ports    = 0;
+                  interface_ip.num_se_rd_ports = 0;
+                  dcache.wbb = new ArrayST(&interface_ip, "dcacheWBB", Core_device, coredynp.opt_local, coredynp.core_ty);
+                  dcache.area.set_area(dcache.area.get_area()+ dcache.wbb->local_result.area);
+                  area.set_area(area.get_area()+ dcache.wbb->local_result.area);
+                  //output_data_csv(dcache.wbb.local_result);
+          }
+
+          /*
+           * LSU--in-order processors do not have separate load queue: unified lsq
+           * partitioned among threads
+           * it is actually the store queue but for inorder processors it serves as both loadQ and StoreQ
+           */
+          tag                                                     = ldst_opcode+XML->sys.virtual_address_width +int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) + EXTRA_TAG_BITS; // opcode + virtual address + thread-id bits + EXTRA_TAG_BITS
+          data                                                    = XML->sys.machine_bits;
+          interface_ip.is_cache                           = true;
+          interface_ip.line_sz             = int(ceil(data/32.0))*4; // bytes, rounded up to a multiple of 4
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.cache_sz            = XML->sys.core[ithCore].store_buffer_size*interface_ip.line_sz*XML->sys.core[ithCore].number_hardware_threads; // store_buffer_size entries per hardware thread
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 1;
+          interface_ip.throughput          = 1.0/clockRate; // one clock period
+          interface_ip.latency             = 1.0/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports        = 0;
+          interface_ip.num_rd_ports        = XML->sys.core[ithCore].memory_ports;
+          interface_ip.num_wr_ports        = XML->sys.core[ithCore].memory_ports;
+          interface_ip.num_se_rd_ports     = 0;
+          interface_ip.num_search_ports    =XML->sys.core[ithCore].memory_ports;
+          LSQ = new ArrayST(&interface_ip, "Load(Store)Queue", Core_device, coredynp.opt_local, coredynp.core_ty);
+          LSQ->area.set_area(LSQ->area.get_area()+ LSQ->local_result.area);
+          area.set_area(area.get_area()+ LSQ->local_result.area);
+          area.set_area(area.get_area()*cdb_overhead); // scales the whole accumulated area (caches and buffers included), not only the LSQ
+          //output_data_csv(LSQ.LSQ.local_result);
+          lsq_height=LSQ->local_result.cache_ht*sqrt(cdb_overhead);/*XML->sys.core[ithCore].number_hardware_threads*/
+
+          if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) // OOO cores with a non-zero load buffer get a separate load queue
+          {
+                  interface_ip.line_sz             = int(ceil(data/32.0))*4; // data and tag are reused unchanged from the LSQ section above
+                  interface_ip.specific_tag        = 1;
+                  interface_ip.tag_w               = tag;
+                  interface_ip.cache_sz            = XML->sys.core[ithCore].load_buffer_size*interface_ip.line_sz*XML->sys.core[ithCore].number_hardware_threads;
+                  interface_ip.assoc               = 0;
+                  interface_ip.nbanks              = 1;
+                  interface_ip.out_w               = interface_ip.line_sz*8;
+                  interface_ip.access_mode         = 1;
+                  interface_ip.throughput          = 1.0/clockRate;
+                  interface_ip.latency             = 1.0/clockRate;
+                  interface_ip.obj_func_dyn_energy = 0;
+                  interface_ip.obj_func_dyn_power  = 0;
+                  interface_ip.obj_func_leak_power = 0;
+                  interface_ip.obj_func_cycle_t    = 1;
+                  interface_ip.num_rw_ports        = 0;
+                  interface_ip.num_rd_ports        = XML->sys.core[ithCore].memory_ports;
+                  interface_ip.num_wr_ports        = XML->sys.core[ithCore].memory_ports;
+                  interface_ip.num_se_rd_ports     = 0;
+                  interface_ip.num_search_ports    =XML->sys.core[ithCore].memory_ports;
+                  LoadQ = new ArrayST(&interface_ip, "LoadQueue", Core_device, coredynp.opt_local, coredynp.core_ty);
+                  LoadQ->area.set_area(LoadQ->area.get_area()+ LoadQ->local_result.area);
+                  area.set_area(area.get_area()+ LoadQ->local_result.area);
+                  area.set_area(area.get_area()*cdb_overhead); // NOTE(review): second multiplication of the accumulated total by cdb_overhead (the first is in the LSQ section) - confirm intended
+                  //output_data_csv(LoadQ.LoadQ.local_result);
+                  lsq_height=(LSQ->local_result.cache_ht + LoadQ->local_result.cache_ht)*sqrt(cdb_overhead);/*XML->sys.core[ithCore].number_hardware_threads*/
+          }
+
+}
+
+MemManU::MemManU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) // Builds the memory-management unit model for core ithCore_: one ITLB and one DTLB array, accumulating their areas.
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_), // presumably copies the caller's InputParameter (member declaration not visible here) - TODO confirm
+ coredynp(dyn_p_),
+ itlb(0),
+ dtlb(0),
+ exist(exist_)
+{
+          if (!exist) return; // nothing is built for a unit flagged as non-existent
+          int  tag, data;
+          bool debug= false; // always false here, so every debug?a:b below takes the XML-driven branch
+
+          clockRate = coredynp.clockRate;
+          executionTime = coredynp.executionTime;
+          interface_ip.is_cache                           = true;
+          interface_ip.pure_cam            = false;
+          interface_ip.pure_ram            = false;
+          interface_ip.specific_tag        = 1;
+          //Itlb TLBs are partitioned among threads according to Niagara and Nehalem
+          tag                                                     = XML->sys.virtual_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))) + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads)))+ EXTRA_TAG_BITS; // virtual page number bits + thread-id bits + EXTRA_TAG_BITS
+          data                                                    = XML->sys.physical_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))); // physical page number bits
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].itlb.number_entries*interface_ip.line_sz;//*XML->sys.core[ithCore].number_hardware_threads;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate; // ITLB timing follows icache_config[4]/[5]
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 0;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports    = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports; // one search port per instruction fetch port
+          itlb = new ArrayST(&interface_ip, "ITLB", Core_device, coredynp.opt_local, coredynp.core_ty); // interface_ip is reused and overwritten for the DTLB below, so statement order matters
+          itlb->area.set_area(itlb->area.get_area()+ itlb->local_result.area);
+          area.set_area(area.get_area()+ itlb->local_result.area);
+          //output_data_csv(itlb.tlb.local_result);
+
+          //dtlb: same tag/data layout as the ITLB, sized by dtlb.number_entries and ported by memory_ports
+          tag                                                     = XML->sys.virtual_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))) +int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads)))+ EXTRA_TAG_BITS;
+          data                                                    = XML->sys.physical_address_width- int(floor(log2(XML->sys.virtual_memory_page_size)));
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+          interface_ip.cache_sz            = XML->sys.core[ithCore].dtlb.number_entries*interface_ip.line_sz;//*XML->sys.core[ithCore].number_hardware_threads;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; // DTLB timing follows dcache_config[4]/[5]
+          interface_ip.latency             = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 0;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = XML->sys.core[ithCore].memory_ports;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports; // one search port per memory port
+          dtlb = new ArrayST(&interface_ip, "DTLB", Core_device, coredynp.opt_local, coredynp.core_ty);
+          dtlb->area.set_area(dtlb->area.get_area()+ dtlb->local_result.area);
+          area.set_area(area.get_area()+ dtlb->local_result.area);
+          //output_data_csv(dtlb.tlb.local_result);
+
+}
+
+RegFU::RegFU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) // Builds the register-file unit model for core ithCore_: integer and FP register files, plus a register-window array when coredynp.regWindowing is set.
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_), // presumably copies the caller's InputParameter (member declaration not visible here) - TODO confirm
+ coredynp(dyn_p_),
+ IRF (0),
+ FRF (0),
+ RFWIN (0),
+ exist(exist_)
+ {
+        /*
+         * processors have separate architectural register files for each thread.
+         * therefore, the bypass buses need to travel across all the register files.
+         */
+        if (!exist) return; // nothing is built for a unit flagged as non-existent
+        int  data;
+
+        clockRate = coredynp.clockRate;
+        executionTime = coredynp.executionTime;
+        //**********************************IRF***************************************
+        data                                                    = coredynp.int_data_width;
+        interface_ip.is_cache                   = false;
+        interface_ip.pure_cam            = false;
+        interface_ip.pure_ram            = true;
+        interface_ip.line_sz             = int(ceil(data/32.0))*4; // bytes, rounded up to a multiple of 4
+        interface_ip.cache_sz            = coredynp.num_IRF_entry*interface_ip.line_sz;
+        interface_ip.assoc               = 1;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 1;
+        interface_ip.throughput          = 1.0/clockRate; // one clock period
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 1;//this is the transfer port for saving/restoring states when exceptions happen.
+        interface_ip.num_rd_ports    = 2*coredynp.peak_issueW; // two read ports per peak issue slot
+        interface_ip.num_wr_ports    = coredynp.peak_issueW;
+        interface_ip.num_se_rd_ports = 0;
+        IRF = new ArrayST(&interface_ip, "Integer Register File", Core_device, coredynp.opt_local, coredynp.core_ty); // interface_ip is reused and overwritten for the arrays below, so statement order matters
+        IRF->area.set_area(IRF->area.get_area()+ IRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_pipelines*cdb_overhead); // array area replicated per hardware thread and per pipeline, scaled by cdb_overhead
+        area.set_area(area.get_area()+ IRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_pipelines*cdb_overhead);
+        //area.set_area(area.get_area()*cdb_overhead);
+        //output_data_csv(IRF.RF.local_result);
+
+        //**********************************FRF***************************************
+        data                                                    = coredynp.fp_data_width;
+        interface_ip.is_cache                   = false;
+        interface_ip.pure_cam            = false;
+        interface_ip.pure_ram            = true;
+        interface_ip.line_sz             = int(ceil(data/32.0))*4; // bytes, rounded up to a multiple of 4
+        interface_ip.cache_sz            = coredynp.num_FRF_entry*interface_ip.line_sz;
+        interface_ip.assoc               = 1;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.access_mode         = 1;
+        interface_ip.throughput          = 1.0/clockRate;
+        interface_ip.latency             = 1.0/clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+        interface_ip.num_rw_ports    = 1;//this is the transfer port for saving/restoring states when exceptions happen.
+        interface_ip.num_rd_ports    = 2*XML->sys.core[ithCore].issue_width; // NOTE(review): FRF ports use sys.core.issue_width while the IRF uses coredynp.peak_issueW - confirm intended
+        interface_ip.num_wr_ports    = XML->sys.core[ithCore].issue_width;
+        interface_ip.num_se_rd_ports = 0;
+        FRF = new ArrayST(&interface_ip, "Floating point Register File", Core_device, coredynp.opt_local, coredynp.core_ty);
+        FRF->area.set_area(FRF->area.get_area()+ FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead); // array area replicated per hardware thread and per FP pipeline, scaled by cdb_overhead
+        area.set_area(area.get_area()+ FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead);
+        //area.set_area(area.get_area()*cdb_overhead);
+        //output_data_csv(FRF.RF.local_result);
+        int_regfile_height= IRF->local_result.cache_ht*XML->sys.core[ithCore].number_hardware_threads*sqrt(cdb_overhead); // heights scale with thread count only, not with the pipeline count (see comment below)
+        fp_regfile_height = FRF->local_result.cache_ht*XML->sys.core[ithCore].number_hardware_threads*sqrt(cdb_overhead);
+    //since a EXU is associated with each pipeline, the cdb should not have longer length.
+        if (coredynp.regWindowing)
+        {
+                //*********************************REG_WIN************************************
+                data                                                    = coredynp.int_data_width; //ECC, and usually 2 regs are transferred together during window shifting.Niagara Mega cell
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = int(ceil(data/8.0)); // bytes, rounded up
+                interface_ip.cache_sz            = XML->sys.core[ithCore].register_windows_size*IRF->l_ip.cache_sz*XML->sys.core[ithCore].number_hardware_threads; // register_windows_size copies of the IRF capacity per hardware thread
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 4.0/clockRate; // four clock periods
+                interface_ip.latency             = 4.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 1;//this is the transfer port for saving/restoring states when exceptions happen.
+                interface_ip.num_rd_ports    = 0;
+                interface_ip.num_wr_ports    = 0;
+                interface_ip.num_se_rd_ports = 0;
+                RFWIN = new ArrayST(&interface_ip, "RegWindow", Core_device, coredynp.opt_local, coredynp.core_ty);
+                RFWIN->area.set_area(RFWIN->area.get_area()+ RFWIN->local_result.area*coredynp.num_pipelines); // replicated per pipeline; no cdb_overhead factor here
+                area.set_area(area.get_area()+ RFWIN->local_result.area*coredynp.num_pipelines);
+                //output_data_csv(RFWIN.RF.local_result);
+        }
+
+
+ }
+
+EXECU::EXECU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, double lsq_height_, const CoreDynParam & dyn_p_, bool exist_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ lsq_height(lsq_height_),
+ coredynp(dyn_p_),
+ rfu(0),
+ scheu(0),
+ fp_u(0),
+ exeu(0),
+ mul(0),
+ int_bypass(0),
+ intTagBypass(0),
+ int_mul_bypass(0),
+ intTag_mul_Bypass(0),
+ fp_bypass(0),
+ fpTagBypass(0),
+ exist(exist_)
+{
+          if (!exist) return;
+          double fu_height = 0.0;
+      clockRate = coredynp.clockRate;
+      executionTime = coredynp.executionTime;
+          rfu   = new RegFU(XML, ithCore, &interface_ip,coredynp);
+          scheu = new SchedulerU(XML, ithCore, &interface_ip,coredynp);
+          exeu  = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, ALU);
+          area.set_area(area.get_area()+ exeu->area.get_area() + rfu->area.get_area() +scheu->area.get_area() );
+          fu_height = exeu->FU_height;
+          if (coredynp.num_fpus >0)
+          {
+                  fp_u  = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, FPU);
+                  area.set_area(area.get_area()+ fp_u->area.get_area());
+          }
+          if (coredynp.num_muls >0)
+          {
+                  mul   = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, MUL);
+                  area.set_area(area.get_area()+ mul->area.get_area());
+                  fu_height +=  mul->FU_height;
+          }
+          /*
+           * broadcast logic, including int-broadcast; int_tag-broadcast; fp-broadcast; fp_tag-broadcast
+           * integer by pass has two paths and fp has 3 paths.
+           * on the same bus there are multiple tri-state drivers and muxes that go to different components on the same bus
+           */
+          if (XML->sys.Embedded)
+                        {
+                        interface_ip.wt                  =Global_30;
+                        interface_ip.wire_is_mat_type = 0;
+                        interface_ip.wire_os_mat_type = 0;
+                    interface_ip.throughput       = 1.0/clockRate;
+                    interface_ip.latency          = 1.0/clockRate;
+                        }
+                else
+                        {
+                        interface_ip.wt                  =Global;
+                        interface_ip.wire_is_mat_type = 2;//start from semi-global since local wires are already used
+                        interface_ip.wire_os_mat_type = 2;
+                    interface_ip.throughput       = 10.0/clockRate; //Do not care
+                    interface_ip.latency          = 10.0/clockRate;
+                        }
+
+          if (coredynp.core_ty==Inorder)
+          {
+                  int_bypass   = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32),
+                                  rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip, 3,
+                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                  bypass.area.set_area(bypass.area.get_area() + int_bypass->area.get_area());
+                  intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.perThreadState,
+                                  rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3,
+                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                  bypass.area.set_area(bypass.area.get_area()  +intTagBypass->area.get_area());
+
+                  if (coredynp.num_muls>0)
+                  {
+                          int_mul_bypass     = new interconnect("Mul Bypass Data" , Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32*1.5),
+                                          rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + lsq_height, &interface_ip, 3,
+                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area()  +int_mul_bypass->area.get_area());
+                          intTag_mul_Bypass  = new interconnect("Mul Bypass tag"  , Core_device, 1, 1, coredynp.perThreadState,
+                                          rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3,
+                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area()  +intTag_mul_Bypass->area.get_area());
+                  }
+
+                  if (coredynp.num_fpus>0)
+                  {
+                          fp_bypass    = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32*1.5),
+                                          rfu->fp_regfile_height + fp_u->FU_height, &interface_ip, 3,
+                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area()  +fp_bypass->area.get_area());
+                          fpTagBypass  = new interconnect("FP Bypass tag"  , Core_device, 1, 1, coredynp.perThreadState,
+                                          rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3,
+                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area()  +fpTagBypass->area.get_area());
+                  }
+          }
+          else
+          {//OOO
+                  if (coredynp.scheu_ty==PhysicalRegFile)
+                  {
+                          /* For physical register based OOO,
+                           * data broadcast interconnects cover across functional units, lsq, inst windows and register files,
+                           * while tag broadcast interconnects also cover across ROB
+                           */
+                          int_bypass   = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)),
+                                                    rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip, 3,
+                                                                false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area()  +int_bypass->area.get_area());
+                          intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width,
+                                                    rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3,
+                                                                false, 1.0, coredynp.opt_local, coredynp.core_ty);
+
+                          if (coredynp.num_muls>0)
+                          {
+                                  int_mul_bypass   = new interconnect("Mul Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)),
+                                                                                rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height, &interface_ip, 3,
+                                                                                false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  intTag_mul_Bypass = new interconnect("Mul Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width,
+                                                                                rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3,
+                                                                                false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  bypass.area.set_area(bypass.area.get_area()  +int_mul_bypass->area.get_area());
+                                  bypass.area.set_area(bypass.area.get_area()  +intTag_mul_Bypass->area.get_area());
+                          }
+
+                          if (coredynp.num_fpus>0)
+                          {
+                                  fp_bypass    = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(coredynp.fp_data_width)),
+                                                                  rfu->fp_regfile_height + fp_u->FU_height, &interface_ip, 3,
+                                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  fpTagBypass  = new interconnect("FP Bypass tag"  , Core_device, 1, 1, coredynp.phy_freg_width,
+                                                                  rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3,
+                                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  bypass.area.set_area(bypass.area.get_area()  +fp_bypass->area.get_area());
+                                  bypass.area.set_area(bypass.area.get_area()  +fpTagBypass->area.get_area());
+                          }
+                  }
+                  else
+                  {
+             /*
+              * In RS-based processors both data and tag are broadcast together,
+              * covering functional units, lsq, inst windows, register files, and ROBs
+              */
+                          int_bypass   = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)),
+                                                    rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height, &interface_ip, 3,
+                                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width,
+                                                    rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3,
+                                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                          bypass.area.set_area(bypass.area.get_area() +int_bypass->area.get_area());
+                          bypass.area.set_area(bypass.area.get_area() +intTagBypass->area.get_area());
+                          if (coredynp.num_muls>0)
+                          {
+                                  int_mul_bypass   = new interconnect("Mul Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)),
+                                                            rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height, &interface_ip, 3,
+                                                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  intTag_mul_Bypass = new interconnect("Mul Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width,
+                                                            rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3,
+                                                                          false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  bypass.area.set_area(bypass.area.get_area() +int_mul_bypass->area.get_area());
+                                  bypass.area.set_area(bypass.area.get_area() +intTag_mul_Bypass->area.get_area());
+                          }
+
+                          if (coredynp.num_fpus>0)
+                          {
+                                  fp_bypass    = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(coredynp.fp_data_width)),
+                                                  rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3,
+                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  fpTagBypass  = new interconnect("FP Bypass tag"  , Core_device, 1, 1, coredynp.phy_freg_width,
+                                                  rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3,
+                                                  false, 1.0, coredynp.opt_local, coredynp.core_ty);
+                                  bypass.area.set_area(bypass.area.get_area() +fp_bypass->area.get_area());
+                                  bypass.area.set_area(bypass.area.get_area() +fpTagBypass->area.get_area());
+                          }
+                  }
+
+
+          }
+          area.set_area(area.get_area()+ bypass.area.get_area());
+}
+
+RENAMINGU::RENAMINGU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ iFRAT(0),
+ fFRAT(0),
+ iRRAT(0),
+ fRRAT(0),
+ ifreeL(0),
+ ffreeL(0),
+ idcl(0),
+ fdcl(0),
+ RAHT(0),
+ exist(exist_)
+ {
+        /*
+         * Although renaming logic may be used in in-order processors,
+         * McPAT assumes no renaming logic is used since the performance gain is very limited and
+         * the only major in-order processor with renaming logic is Itanium,
+         * which is a VLIW processor and different from current McPAT's model.
+         * Physical-register-based OOO must have a Dual-RAT architecture or equivalent structure. FRAT: FrontRAT, RRAT: RetireRAT;
+         * the i,f prefixes mean int and fp.
+         * RAT for all renaming logic: randomly accessible checkpointing is used, but it is only updated when an instruction retires.
+         * FRAT will be read twice and written once per instruction;
+         * RRAT will be written once per instruction when committing and is read out entirely on a context switch.
+         * Checkpointing is implicit.
+         * Renaming logic is duplicated for each hardware thread.
+         *
+         * No Dual-RAT is needed in RS-based OOO processors;
+         * however, the RAT needs to do an associative search when an instruction commits and the ROB releases the entry,
+         * to make sure all the renamings associated with the ROB entry to be released are updated at the same time.
+         * The RAM scheme has # Archi Reg entries, with each entry holding a phy reg tag;
+         * the CAM scheme has # Phy Reg entries, with each entry holding an Archi reg tag.
+         *
+         * Both RAM and CAM have the same DCL.
+         */
+        if (!exist) return;
+        int  tag, data, out_w;
+//     interface_ip.wire_is_mat_type = 0;
+//     interface_ip.wire_os_mat_type = 0;
+//     interface_ip.wt               = Global_30;
+        clockRate = coredynp.clockRate;
+        executionTime = coredynp.executionTime;
+    if (coredynp.core_ty==OOO)
+    {
+        //integer pipeline
+        if (coredynp.scheu_ty==PhysicalRegFile)
+        {
+                if (coredynp.rm_ty ==RAMbased)
+                {        //FRAT with global checkpointing (GCs) please see paper tech report for detailed explaintions
+                        data                                                    = 33;//int(ceil(coredynp.phy_ireg_width*(1+coredynp.globalCheckpoint)/8.0));
+//                     data                                                     = int(ceil(coredynp.phy_ireg_width/8.0));
+                        out_w                            = 1;//int(ceil(coredynp.phy_ireg_width/8.0));
+                        interface_ip.is_cache                   = false;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = true;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_IRF_size;
+                        interface_ip.assoc               = 1;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//the extra one port is for GCs
+                        interface_ip.num_rd_ports    = 2*coredynp.decodeW;
+                        interface_ip.num_wr_ports    = coredynp.decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ iFRAT->area.get_area());
+
+//                     //RAHT According to Intel, combine GC with FRAT is very costly.
+//                     data                                                     = int(ceil(coredynp.phy_ireg_width/8.0)*coredynp.num_IRF_entry);
+//                     out_w                            = data;
+//                     interface_ip.is_cache                    = false;
+//                     interface_ip.pure_cam            = false;
+//                     interface_ip.pure_ram            = true;
+//                     interface_ip.line_sz             = data;
+//                     interface_ip.cache_sz            = data*coredynp.globalCheckpoint;
+//                     interface_ip.assoc               = 1;
+//                     interface_ip.nbanks              = 1;
+//                     interface_ip.out_w               = out_w*8;
+//                     interface_ip.access_mode         = 0;
+//                     interface_ip.throughput          = 1.0/clockRate;
+//                     interface_ip.latency             = 1.0/clockRate;
+//                     interface_ip.obj_func_dyn_energy = 0;
+//                     interface_ip.obj_func_dyn_power  = 0;
+//                     interface_ip.obj_func_leak_power = 0;
+//                     interface_ip.obj_func_cycle_t    = 1;
+//                     interface_ip.num_rw_ports    = 1;//the extra one port is for GCs
+//                     interface_ip.num_rd_ports    = 2*coredynp.decodeW;
+//                     interface_ip.num_wr_ports    = coredynp.decodeW;
+//                     interface_ip.num_se_rd_ports = 0;
+//                     iFRAT = new ArrayST(&interface_ip, "Int FrontRAT");
+//                     iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+//                     area.set_area(area.get_area()+ iFRAT->area.get_area());
+
+                        //FRAT floating point
+                        data                                                    = int(ceil(coredynp.phy_freg_width*(1+coredynp.globalCheckpoint)/8.0));
+                        out_w                            = int(ceil(coredynp.phy_freg_width/8.0));
+                        interface_ip.is_cache                   = false;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = true;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_FRF_size;
+                        interface_ip.assoc               = 1;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//the extra one port is for GCs
+                        interface_ip.num_rd_ports    = 2*coredynp.fp_decodeW;
+                        interface_ip.num_wr_ports    = coredynp.fp_decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ fFRAT->area.get_area());
+
+                }
+                else if ((coredynp.rm_ty ==CAMbased))
+                {
+                        //FRAT
+                        tag                                                         = coredynp.arch_ireg_width;
+                        data                                                    = int(ceil ((coredynp.arch_ireg_width+1*coredynp.globalCheckpoint)/8.0));//the address of CAM needed to be sent out
+                        out_w                            = int(ceil (coredynp.arch_ireg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].phy_Regs_IRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.specific_tag        = 1;
+                        interface_ip.tag_w               = tag;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//for GCs
+                        interface_ip.num_rd_ports    = coredynp.decodeW;
+                        interface_ip.num_wr_ports    = coredynp.decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= 2*coredynp.decodeW;
+                        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ iFRAT->area.get_area());
+
+                        //FRAT for FP
+                        tag                                                         = coredynp.arch_freg_width;
+                        data                                                    = int(ceil ((coredynp.arch_freg_width+1*coredynp.globalCheckpoint)/8.0));//the address of CAM needed to be sent out
+                        out_w                            = int(ceil (coredynp.arch_freg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].phy_Regs_FRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.specific_tag        = 1;
+                        interface_ip.tag_w               = tag;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//for GCs
+                        interface_ip.num_rd_ports    = coredynp.fp_decodeW;
+                        interface_ip.num_wr_ports    = coredynp.fp_decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= 2*coredynp.fp_decodeW;
+                        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ fFRAT->area.get_area());
+
+                }
+
+                //RRAT is always RAM based, does not have GCs, and is used only to record the latest non-speculative mapping
+                data                                                    = int(ceil(coredynp.phy_ireg_width/8.0));
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = data;
+                interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_IRF_size*2;//HACK to make it as least 64B
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 0;
+                interface_ip.num_rd_ports    = XML->sys.core[ithCore].commit_width;
+                interface_ip.num_wr_ports    = XML->sys.core[ithCore].commit_width;
+                interface_ip.num_se_rd_ports = 0;
+                iRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                iRRAT->area.set_area(iRRAT->area.get_area()+ iRRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                area.set_area(area.get_area()+ iRRAT->area.get_area());
+
+                //RRAT for FP
+                data                                                    = int(ceil(coredynp.phy_freg_width/8.0));
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = data;
+                interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_FRF_size*2;//HACK to make it as least 64B
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 0;
+                interface_ip.num_rd_ports    = coredynp.fp_decodeW;
+                interface_ip.num_wr_ports    = coredynp.fp_decodeW;
+                interface_ip.num_se_rd_ports = 0;
+                fRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                fRRAT->area.set_area(fRRAT->area.get_area()+ fRRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                area.set_area(area.get_area()+ fRRAT->area.get_area());
+
+                //The freelist of the renaming unit is always RAM based.
+                //Recycling happens in two places: 1) when the DCL detects a WAW, the phy register/ROB entry is recycled directly into the freelist;
+                // 2) when an instruction commits, its phy register/ROB entry needs to be recycled.
+                //Therefore num_wr port = decode-1 (-1 means at least one phy reg will be used for the current renaming group) + commit width
+                data                                                    = int(ceil(coredynp.phy_ireg_width/8.0));
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = data;
+                interface_ip.cache_sz            = data*coredynp.num_ifreelist_entries;
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 1;//TODO
+                interface_ip.num_rd_ports    = coredynp.decodeW;
+                interface_ip.num_wr_ports    = coredynp.decodeW -1 + XML->sys.core[ithCore].commit_width;
+                //every cycle, (coredynp.decodeW -1) insts may need to send back their dest tags, and commit-width insts need to update the freelist buffers
+                interface_ip.num_se_rd_ports = 0;
+                ifreeL = new ArrayST(&interface_ip, "Int Free List", Core_device, coredynp.opt_local, coredynp.core_ty);
+                ifreeL->area.set_area(ifreeL->area.get_area()+ ifreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                area.set_area(area.get_area()+ ifreeL->area.get_area());
+
+                //freelist for FP
+                data                                                    = int(ceil(coredynp.phy_freg_width/8.0));
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = data;
+                interface_ip.cache_sz            = data*coredynp.num_ffreelist_entries;
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 1;
+                interface_ip.num_rd_ports    = coredynp.fp_decodeW;
+                interface_ip.num_wr_ports    = coredynp.fp_decodeW -1 + XML->sys.core[ithCore].commit_width;
+                interface_ip.num_se_rd_ports = 0;
+                ffreeL = new ArrayST(&interface_ip, "Int Free List", Core_device, coredynp.opt_local, coredynp.core_ty);
+                ffreeL->area.set_area(ffreeL->area.get_area()+ ffreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                area.set_area(area.get_area()+ ffreeL->area.get_area());
+
+                idcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR
+                fdcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width);
+
+        }
+        else if (coredynp.scheu_ty==ReservationStation){
+                if (coredynp.rm_ty ==RAMbased){
+                        /*
+                         * The RAT needs to do an associative search when an instruction commits and the ROB releases the entry,
+                         * to make sure all the renamings associated with the ROB entry to be released are updated to the ARF at the same time.
+                         * A RAM-based RAT for RS-based OOO does not save the search operations. Its advantage is having fewer entries than
+                         * a CAM-based RAT, so it is more scalable as the number of ROB/physical regs increases.
+                         */
+                        tag                                                         = coredynp.phy_ireg_width;
+                        data                                                    = int(ceil(coredynp.phy_ireg_width*(1+coredynp.globalCheckpoint)/8.0));
+                        out_w                            = int(ceil(coredynp.phy_ireg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_IRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//the extra one port is for GCs
+                        interface_ip.num_rd_ports    = 2*coredynp.decodeW;
+                        interface_ip.num_wr_ports    = coredynp.decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= coredynp.commitW;//TODO
+                        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        iFRAT->local_result.adjust_area();
+                        iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ iFRAT->area.get_area());
+
+                        //FP
+                        tag                                                         = coredynp.phy_freg_width;
+                        data                                                    = int(ceil(coredynp.phy_freg_width*(1+coredynp.globalCheckpoint)/8.0));
+                        out_w                            = int(ceil(coredynp.phy_freg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].archi_Regs_FRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//the extra one port is for GCs
+                        interface_ip.num_rd_ports    = 2*coredynp.fp_decodeW;
+                        interface_ip.num_wr_ports    = coredynp.fp_decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= coredynp.fp_decodeW;//actually is fp commit width
+                        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        fFRAT->local_result.adjust_area();
+                        fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ fFRAT->area.get_area());
+
+                }
+                else if ((coredynp.rm_ty ==CAMbased))
+                {
+                        //FRAT
+                        tag                                                         = coredynp.arch_ireg_width;
+                        data                                                    = int(ceil (coredynp.arch_ireg_width+1*coredynp.globalCheckpoint/8.0));//the address of CAM needed to be sent out
+                        out_w                            = int(ceil (coredynp.arch_ireg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].phy_Regs_IRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.specific_tag        = 1;
+                        interface_ip.tag_w               = tag;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//for GCs
+                        interface_ip.num_rd_ports    = XML->sys.core[ithCore].decode_width;//0;TODO
+                        interface_ip.num_wr_ports    = XML->sys.core[ithCore].decode_width;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= 2*XML->sys.core[ithCore].decode_width;
+                        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ iFRAT->area.get_area());
+
+                        //FRAT
+                        tag                                                         = coredynp.arch_freg_width;
+                        data                                                    = int(ceil (coredynp.arch_freg_width+1*coredynp.globalCheckpoint/8.0));//the address of CAM needed to be sent out
+                        out_w                            = int(ceil (coredynp.arch_freg_width/8.0));
+                        interface_ip.is_cache                   = true;
+                        interface_ip.pure_cam            = false;
+                        interface_ip.pure_ram            = false;
+                        interface_ip.line_sz             = data;
+                        interface_ip.cache_sz            = data*XML->sys.core[ithCore].phy_Regs_FRF_size;
+                        interface_ip.assoc               = 0;
+                        interface_ip.nbanks              = 1;
+                        interface_ip.out_w               = out_w*8;
+                        interface_ip.specific_tag        = 1;
+                        interface_ip.tag_w               = tag;
+                        interface_ip.access_mode         = 2;
+                        interface_ip.throughput          = 1.0/clockRate;
+                        interface_ip.latency             = 1.0/clockRate;
+                        interface_ip.obj_func_dyn_energy = 0;
+                        interface_ip.obj_func_dyn_power  = 0;
+                        interface_ip.obj_func_leak_power = 0;
+                        interface_ip.obj_func_cycle_t    = 1;
+                        interface_ip.num_rw_ports    = 1;//for GCs
+                        interface_ip.num_rd_ports    = XML->sys.core[ithCore].decode_width;//0;TODO;
+                        interface_ip.num_wr_ports    = coredynp.fp_decodeW;
+                        interface_ip.num_se_rd_ports = 0;
+                        interface_ip.num_search_ports= 2*coredynp.fp_decodeW;
+                        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty);
+                        fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                        area.set_area(area.get_area()+ fFRAT->area.get_area());
+
+                }
+                //No RRAT for RS based OOO
+                //Freelist of renaming unit of RS based OOO is unifed for both int and fp renaming unit since the ROB is unified
+                data                                                    = int(ceil(coredynp.phy_ireg_width/8.0));
+                interface_ip.is_cache                   = false;
+                interface_ip.pure_cam            = false;
+                interface_ip.pure_ram            = true;
+                interface_ip.line_sz             = data;
+                interface_ip.cache_sz            = data*coredynp.num_ifreelist_entries;
+                interface_ip.assoc               = 1;
+                interface_ip.nbanks              = 1;
+                interface_ip.out_w               = interface_ip.line_sz*8;
+                interface_ip.access_mode         = 1;
+                interface_ip.throughput          = 1.0/clockRate;
+                interface_ip.latency             = 1.0/clockRate;
+                interface_ip.obj_func_dyn_energy = 0;
+                interface_ip.obj_func_dyn_power  = 0;
+                interface_ip.obj_func_leak_power = 0;
+                interface_ip.obj_func_cycle_t    = 1;
+                interface_ip.num_rw_ports    = 1;//TODO
+                interface_ip.num_rd_ports    = XML->sys.core[ithCore].decode_width;
+                interface_ip.num_wr_ports    = XML->sys.core[ithCore].decode_width -1 + XML->sys.core[ithCore].commit_width;
+                interface_ip.num_se_rd_ports = 0;
+                ifreeL = new ArrayST(&interface_ip, "Unified Free List", Core_device, coredynp.opt_local, coredynp.core_ty);
+                ifreeL->area.set_area(ifreeL->area.get_area()+ ifreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+                area.set_area(area.get_area()+ ifreeL->area.get_area());
+
+                idcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR
+                fdcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width);
+        }
+
+}
+    if (coredynp.core_ty==Inorder&& coredynp.issueW>1)
+    {
+          /* Dependency check logic will only present when decode(issue) width>1.
+          *  Multiple issue in order processor can do without renaming, but dcl is a must.
+          */
+        idcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR
+        fdcl  = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width);
+    }
+}
+
+Core::Core(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ ifu  (0),
+ lsu  (0),
+ mmu  (0),
+ exu  (0),
+ rnu  (0),
+ corepipe (0),
+ undiffCore (0),
+ l2cache (0)
+{
+ /*
+  * initialize, compute and optimize individual components.
+  */
+
+  double pipeline_area_per_unit;
+  if (XML->sys.Private_L2)
+  {
+          l2cache = new SharedCache(XML,ithCore, &interface_ip);
+
+  }
+//  interface_ip.wire_is_mat_type = 2;
+//  interface_ip.wire_os_mat_type = 2;
+//  interface_ip.wt               =Global_30;
+  set_core_param();
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  ifu          = new InstFetchU(XML, ithCore, &interface_ip,coredynp);
+  lsu          = new LoadStoreU(XML, ithCore, &interface_ip,coredynp);
+  mmu          = new MemManU   (XML, ithCore, &interface_ip,coredynp);
+  exu          = new EXECU     (XML, ithCore, &interface_ip,lsu->lsq_height, coredynp);
+  undiffCore   = new UndiffCore(XML, ithCore, &interface_ip,coredynp);
+  if (coredynp.core_ty==OOO)
+  {
+          rnu = new RENAMINGU(XML, ithCore, &interface_ip,coredynp);
+  }
+  corepipe = new Pipeline(&interface_ip,coredynp);
+
+  if (coredynp.core_ty==OOO)
+  {
+          pipeline_area_per_unit    = (corepipe->area.get_area()*coredynp.num_pipelines)/5.0;
+          if (rnu->exist)
+          {
+                  rnu->area.set_area(rnu->area.get_area() + pipeline_area_per_unit);
+          }
+  }
+  else {
+          pipeline_area_per_unit    = (corepipe->area.get_area()*coredynp.num_pipelines)/4.0;
+  }
+
+  //area.set_area(area.get_area()+ corepipe->area.get_area());
+  if (ifu->exist)
+  {
+          ifu->area.set_area(ifu->area.get_area() + pipeline_area_per_unit);
+          area.set_area(area.get_area() + ifu->area.get_area());
+  }
+  if (lsu->exist)
+  {
+          lsu->area.set_area(lsu->area.get_area() + pipeline_area_per_unit);
+      area.set_area(area.get_area() + lsu->area.get_area());
+  }
+  if (exu->exist)
+  {
+          exu->area.set_area(exu->area.get_area() + pipeline_area_per_unit);
+          area.set_area(area.get_area()+exu->area.get_area());
+  }
+  if (mmu->exist)
+  {
+          mmu->area.set_area(mmu->area.get_area() + pipeline_area_per_unit);
+      area.set_area(area.get_area()+mmu->area.get_area());
+  }
+
+  if (coredynp.core_ty==OOO)
+  {
+          if (rnu->exist)
+          {
+
+                  area.set_area(area.get_area() + rnu->area.get_area());
+          }
+  }
+
+  if (undiffCore->exist)
+  {
+          area.set_area(area.get_area() + undiffCore->area.get_area());
+  }
+
+  if (XML->sys.Private_L2)
+  {
+          area.set_area(area.get_area() + l2cache->area.get_area());
+
+  }
+//  //clock power
+//  clockNetwork.init_wire_external(is_default, &interface_ip);
+//  clockNetwork.clk_area           =area*1.1;//10% of placement overhead. rule of thumb
+//  clockNetwork.end_wiring_level   =5;//toplevel metal
+//  clockNetwork.start_wiring_level =5;//toplevel metal
+//  clockNetwork.num_regs           = corepipe.tot_stage_vector;
+//  clockNetwork.optimize_wire();
+}
+
+
+void BranchPredictor::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        double r_access;
+        double w_access;
+        if (is_tdp)
+    {
+        r_access = coredynp.predictionW*coredynp.BR_duty_cycle;
+        w_access = 0*coredynp.BR_duty_cycle;
+        globalBPT->stats_t.readAc.access  = r_access;
+        globalBPT->stats_t.writeAc.access = w_access;
+        globalBPT->tdp_stats = globalBPT->stats_t;
+
+        L1_localBPT->stats_t.readAc.access  = r_access;
+        L1_localBPT->stats_t.writeAc.access = w_access;
+        L1_localBPT->tdp_stats = L1_localBPT->stats_t;
+
+        L2_localBPT->stats_t.readAc.access  = r_access;
+        L2_localBPT->stats_t.writeAc.access = w_access;
+        L2_localBPT->tdp_stats = L2_localBPT->stats_t;
+
+        chooser->stats_t.readAc.access  = r_access;
+        chooser->stats_t.writeAc.access = w_access;
+        chooser->tdp_stats = chooser->stats_t;
+
+        RAS->stats_t.readAc.access  = r_access;
+        RAS->stats_t.writeAc.access = w_access;
+        RAS->tdp_stats = RAS->stats_t;
+    }
+    else
+    {
+        //The resolution of BPT accesses is coarse, but this is
+        //because most simulators cannot track finer grained details
+        r_access = XML->sys.core[ithCore].branch_instructions;
+        w_access = XML->sys.core[ithCore].branch_mispredictions + 0.1*XML->sys.core[ithCore].branch_instructions;//10% of BR will flip internal bits//0
+        globalBPT->stats_t.readAc.access  = r_access;
+        globalBPT->stats_t.writeAc.access = w_access;
+        globalBPT->rtp_stats = globalBPT->stats_t;
+
+        L1_localBPT->stats_t.readAc.access  = r_access;
+        L1_localBPT->stats_t.writeAc.access = w_access;
+        L1_localBPT->rtp_stats = L1_localBPT->stats_t;
+
+        L2_localBPT->stats_t.readAc.access  = r_access;
+        L2_localBPT->stats_t.writeAc.access = w_access;
+        L2_localBPT->rtp_stats = L2_localBPT->stats_t;
+
+        chooser->stats_t.readAc.access  = r_access;
+        chooser->stats_t.writeAc.access = w_access;
+        chooser->rtp_stats = chooser->stats_t;
+
+        RAS->stats_t.readAc.access  = XML->sys.core[ithCore].function_calls;
+        RAS->stats_t.writeAc.access = XML->sys.core[ithCore].function_calls;
+        RAS->rtp_stats = RAS->stats_t;
+   }
+
+        globalBPT->power_t.reset();
+        L1_localBPT->power_t.reset();
+        L2_localBPT->power_t.reset();
+        chooser->power_t.reset();
+        RAS->power_t.reset();
+
+    globalBPT->power_t.readOp.dynamic   +=  globalBPT->local_result.power.readOp.dynamic*globalBPT->stats_t.readAc.access +
+                globalBPT->stats_t.writeAc.access*globalBPT->local_result.power.writeOp.dynamic;
+    L1_localBPT->power_t.readOp.dynamic   +=  L1_localBPT->local_result.power.readOp.dynamic*L1_localBPT->stats_t.readAc.access +
+                L1_localBPT->stats_t.writeAc.access*L1_localBPT->local_result.power.writeOp.dynamic;
+
+    L2_localBPT->power_t.readOp.dynamic   +=  L2_localBPT->local_result.power.readOp.dynamic*L2_localBPT->stats_t.readAc.access +
+                L2_localBPT->stats_t.writeAc.access*L2_localBPT->local_result.power.writeOp.dynamic;
+
+    chooser->power_t.readOp.dynamic   +=  chooser->local_result.power.readOp.dynamic*chooser->stats_t.readAc.access +
+                chooser->stats_t.writeAc.access*chooser->local_result.power.writeOp.dynamic;
+    RAS->power_t.readOp.dynamic   +=  RAS->local_result.power.readOp.dynamic*RAS->stats_t.readAc.access +
+                RAS->stats_t.writeAc.access*RAS->local_result.power.writeOp.dynamic;
+
+    if (is_tdp)
+    {
+        globalBPT->power = globalBPT->power_t + globalBPT->local_result.power*pppm_lkg;
+        L1_localBPT->power = L1_localBPT->power_t + L1_localBPT->local_result.power*pppm_lkg;
+        L2_localBPT->power = L2_localBPT->power_t + L2_localBPT->local_result.power*pppm_lkg;
+        chooser->power = chooser->power_t + chooser->local_result.power*pppm_lkg;
+        RAS->power = RAS->power_t + RAS->local_result.power*coredynp.pppm_lkg_multhread;
+
+        power = power + globalBPT->power + L1_localBPT->power + chooser->power + RAS->power;
+    }
+    else
+    {
+        globalBPT->rt_power = globalBPT->power_t + globalBPT->local_result.power*pppm_lkg;
+        L1_localBPT->rt_power = L1_localBPT->power_t + L1_localBPT->local_result.power*pppm_lkg;
+        L2_localBPT->rt_power = L2_localBPT->power_t + L2_localBPT->local_result.power*pppm_lkg;
+        chooser->rt_power = chooser->power_t + chooser->local_result.power*pppm_lkg;
+        RAS->rt_power = RAS->power_t + RAS->local_result.power*coredynp.pppm_lkg_multhread;
+        rt_power = rt_power + globalBPT->rt_power + L1_localBPT->rt_power + chooser->rt_power + RAS->rt_power;
+    }
+}
+
+void BranchPredictor::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+        if (is_tdp)
+        {
+                cout << indent_str<< "Global Predictor:" << endl;
+                cout << indent_str_next << "Area = " << globalBPT->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << globalBPT->power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? globalBPT->power.readOp.longer_channel_leakage:globalBPT->power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << globalBPT->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << globalBPT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str << "Local Predictor:" << endl;
+                cout << indent_str << "L1_Local Predictor:" << endl;
+                cout << indent_str_next << "Area = " << L1_localBPT->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << L1_localBPT->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? L1_localBPT->power.readOp.longer_channel_leakage:L1_localBPT->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << L1_localBPT->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << L1_localBPT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str << "L2_Local Predictor:" << endl;
+                cout << indent_str_next << "Area = " << L2_localBPT->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << L2_localBPT->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? L2_localBPT->power.readOp.longer_channel_leakage:L2_localBPT->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << L2_localBPT->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << L2_localBPT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+
+                cout << indent_str << "Chooser:" << endl;
+                cout << indent_str_next << "Area = " << chooser->area.get_area()  *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << chooser->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? chooser->power.readOp.longer_channel_leakage:chooser->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << chooser->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << chooser->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str << "RAS:" << endl;
+                cout << indent_str_next << "Area = " << RAS->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << RAS->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? RAS->power.readOp.longer_channel_leakage:RAS->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << RAS->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << RAS->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+//             cout << indent_str_next << "Global Predictor    Peak Dynamic = " << globalBPT->rt_power.readOp.dynamic*clockRate << " W" << endl;
+//             cout << indent_str_next << "Global Predictor    Subthreshold Leakage = " << globalBPT->rt_power.readOp.leakage <<" W" << endl;
+//             cout << indent_str_next << "Global Predictor    Gate Leakage = " << globalBPT->rt_power.readOp.gate_leakage << " W" << endl;
+//             cout << indent_str_next << "Local Predictor   Peak Dynamic = " << L1_localBPT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Local Predictor   Subthreshold Leakage = " << L1_localBPT->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Local Predictor   Gate Leakage = " << L1_localBPT->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Chooser   Peak Dynamic = " << chooser->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Chooser   Subthreshold Leakage = " << chooser->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Chooser   Gate Leakage = " << chooser->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "RAS   Peak Dynamic = " << RAS->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "RAS   Subthreshold Leakage = " << RAS->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "RAS   Gate Leakage = " << RAS->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+
+}
+
+void InstFetchU::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        if (is_tdp)
+    {
+                //init stats for Peak
+        icache.caches->stats_t.readAc.access  = icache.caches->l_ip.num_rw_ports*coredynp.IFU_duty_cycle;
+        icache.caches->stats_t.readAc.miss    = 0;
+        icache.caches->stats_t.readAc.hit     = icache.caches->stats_t.readAc.access - icache.caches->stats_t.readAc.miss;
+        icache.caches->tdp_stats = icache.caches->stats_t;
+
+        icache.missb->stats_t.readAc.access  = icache.missb->stats_t.readAc.hit=  icache.missb->l_ip.num_search_ports;
+        icache.missb->stats_t.writeAc.access = icache.missb->stats_t.writeAc.hit= icache.missb->l_ip.num_search_ports;
+        icache.missb->tdp_stats = icache.missb->stats_t;
+
+        icache.ifb->stats_t.readAc.access  = icache.ifb->stats_t.readAc.hit=  icache.ifb->l_ip.num_search_ports;
+        icache.ifb->stats_t.writeAc.access = icache.ifb->stats_t.writeAc.hit= icache.ifb->l_ip.num_search_ports;
+        icache.ifb->tdp_stats = icache.ifb->stats_t;
+
+        icache.prefetchb->stats_t.readAc.access  = icache.prefetchb->stats_t.readAc.hit= icache.prefetchb->l_ip.num_search_ports;
+        icache.prefetchb->stats_t.writeAc.access = icache.ifb->stats_t.writeAc.hit= icache.ifb->l_ip.num_search_ports;
+        icache.prefetchb->tdp_stats = icache.prefetchb->stats_t;
+
+        IB->stats_t.readAc.access = IB->stats_t.writeAc.access = XML->sys.core[ithCore].peak_issue_width;
+        IB->tdp_stats = IB->stats_t;
+
+        if (coredynp.predictionW>0)
+        {
+                BTB->stats_t.readAc.access  = coredynp.predictionW;//XML->sys.core[ithCore].BTB.read_accesses;
+                BTB->stats_t.writeAc.access = 0;//XML->sys.core[ithCore].BTB.write_accesses;
+        }
+
+        ID_inst->stats_t.readAc.access     = coredynp.decodeW;
+        ID_operand->stats_t.readAc.access  = coredynp.decodeW;
+        ID_misc->stats_t.readAc.access     = coredynp.decodeW;
+        ID_inst->tdp_stats = ID_inst->stats_t;
+        ID_operand->tdp_stats = ID_operand->stats_t;
+        ID_misc->tdp_stats = ID_misc->stats_t;
+
+
+    }
+    else
+    {
+        //init stats for Runtime Dynamic (RTP)
+        icache.caches->stats_t.readAc.access  = XML->sys.core[ithCore].icache.read_accesses;
+        icache.caches->stats_t.readAc.miss    = XML->sys.core[ithCore].icache.read_misses;
+        icache.caches->stats_t.readAc.hit     = icache.caches->stats_t.readAc.access - icache.caches->stats_t.readAc.miss;
+        icache.caches->rtp_stats = icache.caches->stats_t;
+
+        icache.missb->stats_t.readAc.access  = icache.caches->stats_t.readAc.miss;
+        icache.missb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss;
+        icache.missb->rtp_stats = icache.missb->stats_t;
+
+        icache.ifb->stats_t.readAc.access  = icache.caches->stats_t.readAc.miss;
+        icache.ifb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss;
+        icache.ifb->rtp_stats = icache.ifb->stats_t;
+
+        icache.prefetchb->stats_t.readAc.access  = icache.caches->stats_t.readAc.miss;
+        icache.prefetchb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss;
+        icache.prefetchb->rtp_stats = icache.prefetchb->stats_t;
+
+        IB->stats_t.readAc.access = IB->stats_t.writeAc.access = XML->sys.core[ithCore].total_instructions;
+        IB->rtp_stats = IB->stats_t;
+
+        if (coredynp.predictionW>0)
+        {
+                BTB->stats_t.readAc.access  = XML->sys.core[ithCore].BTB.read_accesses;//XML->sys.core[ithCore].branch_instructions;
+                BTB->stats_t.writeAc.access = XML->sys.core[ithCore].BTB.write_accesses;//XML->sys.core[ithCore].branch_mispredictions;
+                BTB->rtp_stats = BTB->stats_t;
+        }
+
+        ID_inst->stats_t.readAc.access     = XML->sys.core[ithCore].total_instructions;
+        ID_operand->stats_t.readAc.access  = XML->sys.core[ithCore].total_instructions;
+        ID_misc->stats_t.readAc.access     = XML->sys.core[ithCore].total_instructions;
+        ID_inst->rtp_stats = ID_inst->stats_t;
+        ID_operand->rtp_stats = ID_operand->stats_t;
+        ID_misc->rtp_stats = ID_misc->stats_t;
+
+    }
+
+    icache.power_t.reset();
+    IB->power_t.reset();
+//     ID_inst->power_t.reset();
+//     ID_operand->power_t.reset();
+//     ID_misc->power_t.reset();
+    if (coredynp.predictionW>0)
+    {
+        BTB->power_t.reset();
+    }
+
+    icache.power_t.readOp.dynamic      += (icache.caches->stats_t.readAc.hit*icache.caches->local_result.power.readOp.dynamic+
+                //icache.caches->stats_t.readAc.miss*icache.caches->local_result.tag_array2->power.readOp.dynamic+
+                icache.caches->stats_t.readAc.miss*icache.caches->local_result.power.readOp.dynamic+ //assume tag data accessed in parallel
+                icache.caches->stats_t.readAc.miss*icache.caches->local_result.power.writeOp.dynamic); //read miss in Icache cause a write to Icache
+    icache.power_t.readOp.dynamic      +=  icache.missb->stats_t.readAc.access*icache.missb->local_result.power.searchOp.dynamic +
+            icache.missb->stats_t.writeAc.access*icache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write
+    icache.power_t.readOp.dynamic      +=  icache.ifb->stats_t.readAc.access*icache.ifb->local_result.power.searchOp.dynamic +
+            icache.ifb->stats_t.writeAc.access*icache.ifb->local_result.power.writeOp.dynamic;
+    icache.power_t.readOp.dynamic      +=  icache.prefetchb->stats_t.readAc.access*icache.prefetchb->local_result.power.searchOp.dynamic +
+            icache.prefetchb->stats_t.writeAc.access*icache.prefetchb->local_result.power.writeOp.dynamic;
+
+        IB->power_t.readOp.dynamic   +=  IB->local_result.power.readOp.dynamic*IB->stats_t.readAc.access +
+                        IB->stats_t.writeAc.access*IB->local_result.power.writeOp.dynamic;
+
+        if (coredynp.predictionW>0)
+        {
+                BTB->power_t.readOp.dynamic   +=  BTB->local_result.power.readOp.dynamic*BTB->stats_t.readAc.access +
+                BTB->stats_t.writeAc.access*BTB->local_result.power.writeOp.dynamic;
+
+                BPT->computeEnergy(is_tdp);
+        }
+
+    if (is_tdp)
+    {
+//     icache.power = icache.power_t +
+//             (icache.caches->local_result.power)*pppm_lkg +
+//                     (icache.missb->local_result.power +
+//                     icache.ifb->local_result.power +
+//                     icache.prefetchb->local_result.power)*pppm_Isub;
+        icache.power = icache.power_t +
+                (icache.caches->local_result.power +
+                        icache.missb->local_result.power +
+                        icache.ifb->local_result.power +
+                        icache.prefetchb->local_result.power)*pppm_lkg;
+
+        IB->power = IB->power_t + IB->local_result.power*pppm_lkg;
+        power     = power + icache.power + IB->power;
+        if (coredynp.predictionW>0)
+        {
+                BTB->power = BTB->power_t + BTB->local_result.power*pppm_lkg;
+                power     = power  + BTB->power + BPT->power;
+        }
+
+        ID_inst->power_t.readOp.dynamic    = ID_inst->power.readOp.dynamic;
+        ID_operand->power_t.readOp.dynamic = ID_operand->power.readOp.dynamic;
+        ID_misc->power_t.readOp.dynamic    = ID_misc->power.readOp.dynamic;
+
+        ID_inst->power.readOp.dynamic    *= ID_inst->tdp_stats.readAc.access;
+        ID_operand->power.readOp.dynamic *= ID_operand->tdp_stats.readAc.access;
+        ID_misc->power.readOp.dynamic    *= ID_misc->tdp_stats.readAc.access;
+
+        power = power + (ID_inst->power +
+                                                        ID_operand->power +
+                                                        ID_misc->power);
+    }
+    else
+    {
+//     icache.rt_power = icache.power_t +
+//             (icache.caches->local_result.power)*pppm_lkg +
+//                     (icache.missb->local_result.power +
+//                     icache.ifb->local_result.power +
+//                     icache.prefetchb->local_result.power)*pppm_Isub;
+
+        icache.rt_power = icache.power_t +
+                (icache.caches->local_result.power +
+                        icache.missb->local_result.power +
+                        icache.ifb->local_result.power +
+                        icache.prefetchb->local_result.power)*pppm_lkg;
+
+        IB->rt_power = IB->power_t + IB->local_result.power*pppm_lkg;
+        rt_power     = rt_power + icache.rt_power + IB->rt_power;
+        if (coredynp.predictionW>0)
+        {
+                BTB->rt_power = BTB->power_t + BTB->local_result.power*pppm_lkg;
+                rt_power     = rt_power + BTB->rt_power + BPT->rt_power;
+        }
+
+        ID_inst->rt_power.readOp.dynamic    = ID_inst->power_t.readOp.dynamic*ID_inst->rtp_stats.readAc.access;
+        ID_operand->rt_power.readOp.dynamic = ID_operand->power_t.readOp.dynamic * ID_operand->rtp_stats.readAc.access;
+        ID_misc->rt_power.readOp.dynamic    = ID_misc->power_t.readOp.dynamic * ID_misc->rtp_stats.readAc.access;
+
+        rt_power = rt_power + (ID_inst->rt_power +
+                                                        ID_operand->rt_power +
+                                                        ID_misc->rt_power);
+    }
+}
+
+void InstFetchU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+        if (is_tdp)
+        {
+
+                cout << indent_str<< "Instruction Cache:" << endl;
+                cout << indent_str_next << "Area = " << icache.area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << icache.power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? icache.power.readOp.longer_channel_leakage:icache.power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << icache.power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << icache.rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                if (coredynp.predictionW>0)
+                {
+                        cout << indent_str<< "Branch Target Buffer:" << endl;
+                        cout << indent_str_next << "Area = " << BTB->area.get_area() *1e-6 << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << BTB->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? BTB->power.readOp.longer_channel_leakage:BTB->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << BTB->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << BTB->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (BPT->exist)
+                        {
+                                cout << indent_str<< "Branch Predictor:" << endl;
+                                cout << indent_str_next << "Area = " << BPT->area.get_area()  *1e-6<< " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << BPT->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                        << (long_channel? BPT->power.readOp.longer_channel_leakage:BPT->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << BPT->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << BPT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                                if (plevel>3)
+                                {
+                                        BPT->displayEnergy(indent+4, plevel, is_tdp);
+                                }
+                        }
+                }
+                cout << indent_str<< "Instruction Buffer:" << endl;
+                cout << indent_str_next << "Area = " << IB->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << IB->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                << (long_channel? IB->power.readOp.longer_channel_leakage:IB->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << IB->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << IB->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str<< "Instruction Decoder:" << endl;
+                cout << indent_str_next << "Area = " << (ID_inst->area.get_area() +
+                                ID_operand->area.get_area() +
+                                ID_misc->area.get_area())*coredynp.decodeW*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << (ID_inst->power.readOp.dynamic +
+                                ID_operand->power.readOp.dynamic +
+                                ID_misc->power.readOp.dynamic)*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                << (long_channel? (ID_inst->power.readOp.longer_channel_leakage +
+                                ID_operand->power.readOp.longer_channel_leakage +
+                                ID_misc->power.readOp.longer_channel_leakage):
+                                        (ID_inst->power.readOp.leakage +
+                                                        ID_operand->power.readOp.leakage +
+                                                        ID_misc->power.readOp.leakage))  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << (ID_inst->power.readOp.gate_leakage +
+                                ID_operand->power.readOp.gate_leakage +
+                                ID_misc->power.readOp.gate_leakage)  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << (ID_inst->rt_power.readOp.dynamic +
+                                ID_operand->rt_power.readOp.dynamic +
+                                ID_misc->rt_power.readOp.dynamic)/executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+//             cout << indent_str_next << "Instruction Cache    Peak Dynamic = " << icache.rt_power.readOp.dynamic*clockRate << " W" << endl;
+//             cout << indent_str_next << "Instruction Cache    Subthreshold Leakage = " << icache.rt_power.readOp.leakage <<" W" << endl;
+//             cout << indent_str_next << "Instruction Cache    Gate Leakage = " << icache.rt_power.readOp.gate_leakage << " W" << endl;
+//             cout << indent_str_next << "Instruction Buffer   Peak Dynamic = " << IB->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Instruction Buffer   Subthreshold Leakage = " << IB->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Instruction Buffer   Gate Leakage = " << IB->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Branch Target Buffer   Peak Dynamic = " << BTB->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Branch Target Buffer   Subthreshold Leakage = " << BTB->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Branch Target Buffer   Gate Leakage = " << BTB->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Branch Predictor   Peak Dynamic = " << BPT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Branch Predictor   Subthreshold Leakage = " << BPT->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Branch Predictor   Gate Leakage = " << BPT->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+
+}
+
+void RENAMINGU::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        double pppm_t[4]    = {1,1,1,1};
+        if (is_tdp)
+        {//init stats for Peak
+                if (coredynp.core_ty==OOO){
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                if (coredynp.rm_ty ==RAMbased)
+                                {
+                                        iFRAT->stats_t.readAc.access   = iFRAT->l_ip.num_rd_ports;
+                                        iFRAT->stats_t.writeAc.access  = iFRAT->l_ip.num_wr_ports;
+                                        iFRAT->tdp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = fFRAT->l_ip.num_rd_ports;
+                                        fFRAT->stats_t.writeAc.access  = fFRAT->l_ip.num_wr_ports;
+                                        fFRAT->tdp_stats = fFRAT->stats_t;
+
+                                }
+                                else if ((coredynp.rm_ty ==CAMbased))
+                                {
+                                        iFRAT->stats_t.readAc.access   = iFRAT->l_ip.num_search_ports;
+                                        iFRAT->stats_t.writeAc.access  = iFRAT->l_ip.num_wr_ports;
+                                        iFRAT->tdp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = fFRAT->l_ip.num_search_ports;
+                                        fFRAT->stats_t.writeAc.access  = fFRAT->l_ip.num_wr_ports;
+                                        fFRAT->tdp_stats = fFRAT->stats_t;
+                                }
+
+                                iRRAT->stats_t.readAc.access   = iRRAT->l_ip.num_rd_ports;
+                                iRRAT->stats_t.writeAc.access  = iRRAT->l_ip.num_wr_ports;
+                                iRRAT->tdp_stats = iRRAT->stats_t;
+
+                                fRRAT->stats_t.readAc.access   = fRRAT->l_ip.num_rd_ports;
+                                fRRAT->stats_t.writeAc.access  = fRRAT->l_ip.num_wr_ports;
+                                fRRAT->tdp_stats = fRRAT->stats_t;
+
+                                ifreeL->stats_t.readAc.access   = coredynp.decodeW;//ifreeL->l_ip.num_rd_ports;;
+                                ifreeL->stats_t.writeAc.access  = coredynp.decodeW;//ifreeL->l_ip.num_wr_ports;
+                                ifreeL->tdp_stats = ifreeL->stats_t;
+
+                                ffreeL->stats_t.readAc.access   = coredynp.decodeW;//ffreeL->l_ip.num_rd_ports;
+                                ffreeL->stats_t.writeAc.access  = coredynp.decodeW;//ffreeL->l_ip.num_wr_ports;
+                                ffreeL->tdp_stats = ffreeL->stats_t;
+                        }
+                        else if (coredynp.scheu_ty==ReservationStation){
+                                if (coredynp.rm_ty ==RAMbased)
+                                {
+                                        iFRAT->stats_t.readAc.access    = iFRAT->l_ip.num_rd_ports;
+                                        iFRAT->stats_t.writeAc.access   = iFRAT->l_ip.num_wr_ports;
+                                        iFRAT->stats_t.searchAc.access  = iFRAT->l_ip.num_search_ports;
+                                        iFRAT->tdp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access    = fFRAT->l_ip.num_rd_ports;
+                                        fFRAT->stats_t.writeAc.access   = fFRAT->l_ip.num_wr_ports;
+                                        fFRAT->stats_t.searchAc.access  = fFRAT->l_ip.num_search_ports;
+                                        fFRAT->tdp_stats = fFRAT->stats_t;
+
+                                }
+                                else if ((coredynp.rm_ty ==CAMbased))
+                                {
+                                        iFRAT->stats_t.readAc.access   = iFRAT->l_ip.num_search_ports;
+                                        iFRAT->stats_t.writeAc.access  = iFRAT->l_ip.num_wr_ports;
+                                        iFRAT->tdp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = fFRAT->l_ip.num_search_ports;
+                                        fFRAT->stats_t.writeAc.access  = fFRAT->l_ip.num_wr_ports;
+                                        fFRAT->tdp_stats = fFRAT->stats_t;
+                                }
+                                //Unified free list for both int and fp
+                                ifreeL->stats_t.readAc.access   = coredynp.decodeW;//ifreeL->l_ip.num_rd_ports;
+                                ifreeL->stats_t.writeAc.access  = coredynp.decodeW;//ifreeL->l_ip.num_wr_ports;
+                                ifreeL->tdp_stats = ifreeL->stats_t;
+                        }
+                        idcl->stats_t.readAc.access = coredynp.decodeW;
+                        fdcl->stats_t.readAc.access = coredynp.decodeW;
+                        idcl->tdp_stats = idcl->stats_t;
+                        fdcl->tdp_stats = fdcl->stats_t;
+                }
+                else
+                {
+                        if (coredynp.issueW>1)
+                        {
+                                idcl->stats_t.readAc.access = coredynp.decodeW;
+                                fdcl->stats_t.readAc.access = coredynp.decodeW;
+                                idcl->tdp_stats = idcl->stats_t;
+                                fdcl->tdp_stats = fdcl->stats_t;
+                        }
+                }
+
+        }
+        else
+        {//init stats for Runtime Dynamic (RTP)
+                if (coredynp.core_ty==OOO){
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                if (coredynp.rm_ty ==RAMbased)
+                                {
+                                        iFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads;
+                                        iFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].rename_writes;
+                                        iFRAT->rtp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_reads;
+                                        fFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].fp_rename_writes;
+                                        fFRAT->rtp_stats = fFRAT->stats_t;
+                                }
+                                else if ((coredynp.rm_ty ==CAMbased))
+                                {
+                                        iFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads;
+                                        iFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].rename_writes;
+                                        iFRAT->rtp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_reads;
+                                        fFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].fp_rename_writes;
+                                        fFRAT->rtp_stats = fFRAT->stats_t;
+                                }
+
+                                iRRAT->stats_t.readAc.access   = XML->sys.core[ithCore].rename_writes;//Hack, should be (context switch + branch mispredictions)*16
+                                iRRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].rename_writes;
+                                iRRAT->rtp_stats = iRRAT->stats_t;
+
+                                fRRAT->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_writes;//Hack, should be (context switch + branch mispredictions)*16
+                                fRRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].fp_rename_writes;
+                                fRRAT->rtp_stats = fRRAT->stats_t;
+
+                                ifreeL->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads;
+                                ifreeL->stats_t.writeAc.access  = 2*XML->sys.core[ithCore].rename_writes;
+                                ifreeL->rtp_stats = ifreeL->stats_t;
+
+                                ffreeL->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_reads;
+                                ffreeL->stats_t.writeAc.access  = 2*XML->sys.core[ithCore].fp_rename_writes;
+                                ffreeL->rtp_stats = ffreeL->stats_t;
+                        }
+                        else if (coredynp.scheu_ty==ReservationStation){
+                                if (coredynp.rm_ty ==RAMbased)
+                                {
+                                        iFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads;
+                                        iFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].rename_writes;
+                                        iFRAT->stats_t.searchAc.access  = XML->sys.core[ithCore].committed_int_instructions;//hack: not all committed instructions use regs.
+                                        iFRAT->rtp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_reads;
+                                        fFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].fp_rename_writes;
+                                        fFRAT->stats_t.searchAc.access  = XML->sys.core[ithCore].committed_fp_instructions;
+                                        fFRAT->rtp_stats = fFRAT->stats_t;
+                                }
+                                else if ((coredynp.rm_ty ==CAMbased))
+                                {
+                                        iFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads;
+                                        iFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].rename_writes;
+                                        iFRAT->rtp_stats = iFRAT->stats_t;
+
+                                        fFRAT->stats_t.readAc.access   = XML->sys.core[ithCore].fp_rename_reads;
+                                        fFRAT->stats_t.writeAc.access  = XML->sys.core[ithCore].fp_rename_writes;
+                                        fFRAT->rtp_stats = fFRAT->stats_t;
+                                }
+                                //Unified free list for both int and fp since the ROB act as physcial registers
+                                ifreeL->stats_t.readAc.access   = XML->sys.core[ithCore].rename_reads +
+                                        XML->sys.core[ithCore].fp_rename_reads;
+                                ifreeL->stats_t.writeAc.access  = 2*(XML->sys.core[ithCore].rename_writes +
+                                        XML->sys.core[ithCore].fp_rename_writes);//HACK: 2-> since some of renaming in the same group
+                                                                                                                         //are terminated early
+                                ifreeL->rtp_stats = ifreeL->stats_t;
+                        }
+                        idcl->stats_t.readAc.access = 3*coredynp.decodeW*coredynp.decodeW*XML->sys.core[ithCore].rename_reads;
+                        fdcl->stats_t.readAc.access = 3*coredynp.fp_issueW*coredynp.fp_issueW*XML->sys.core[ithCore].fp_rename_writes;
+                        idcl->rtp_stats = idcl->stats_t;
+                        fdcl->rtp_stats = fdcl->stats_t;
+                }
+                else
+                {
+                        if (coredynp.issueW>1)
+                        {
+                                idcl->stats_t.readAc.access = 2*XML->sys.core[ithCore].int_instructions;
+                                fdcl->stats_t.readAc.access = XML->sys.core[ithCore].fp_instructions;
+                                idcl->rtp_stats = idcl->stats_t;
+                                fdcl->rtp_stats = fdcl->stats_t;
+                        }
+                }
+
+        }
+    /* Compute engine */
+        if (coredynp.core_ty==OOO)
+        {
+                if (coredynp.scheu_ty==PhysicalRegFile)
+                {
+                        if (coredynp.rm_ty ==RAMbased)
+                        {
+                                iFRAT->power_t.reset();
+                                fFRAT->power_t.reset();
+
+                                iFRAT->power_t.readOp.dynamic  +=  (iFRAT->stats_t.readAc.access
+                                                *(iFRAT->local_result.power.readOp.dynamic + idcl->power.readOp.dynamic)
+                                                +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic);
+                                fFRAT->power_t.readOp.dynamic  +=  (fFRAT->stats_t.readAc.access
+                                                *(fFRAT->local_result.power.readOp.dynamic + fdcl->power.readOp.dynamic)
+                                                +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic);
+                        }
+                        else if ((coredynp.rm_ty ==CAMbased))
+                        {
+                                iFRAT->power_t.reset();
+                                fFRAT->power_t.reset();
+                                iFRAT->power_t.readOp.dynamic  +=  (iFRAT->stats_t.readAc.access
+                                                *(iFRAT->local_result.power.searchOp.dynamic + idcl->power.readOp.dynamic)
+                                                +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic);
+                                fFRAT->power_t.readOp.dynamic  +=  (fFRAT->stats_t.readAc.access
+                                                *(fFRAT->local_result.power.searchOp.dynamic + fdcl->power.readOp.dynamic)
+                                                +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic);
+                        }
+
+                        iRRAT->power_t.reset();
+                        fRRAT->power_t.reset();
+                        ifreeL->power_t.reset();
+                        ffreeL->power_t.reset();
+
+                        iRRAT->power_t.readOp.dynamic  +=  (iRRAT->stats_t.readAc.access*iRRAT->local_result.power.readOp.dynamic
+                                        +iRRAT->stats_t.writeAc.access*iRRAT->local_result.power.writeOp.dynamic);
+                        fRRAT->power_t.readOp.dynamic  +=  (fRRAT->stats_t.readAc.access*fRRAT->local_result.power.readOp.dynamic
+                                        +fRRAT->stats_t.writeAc.access*fRRAT->local_result.power.writeOp.dynamic);
+                        ifreeL->power_t.readOp.dynamic  +=  (ifreeL->stats_t.readAc.access*ifreeL->local_result.power.readOp.dynamic
+                                        +ifreeL->stats_t.writeAc.access*ifreeL->local_result.power.writeOp.dynamic);
+                        ffreeL->power_t.readOp.dynamic  +=  (ffreeL->stats_t.readAc.access*ffreeL->local_result.power.readOp.dynamic
+                                        +ffreeL->stats_t.writeAc.access*ffreeL->local_result.power.writeOp.dynamic);
+
+                }
+                else if (coredynp.scheu_ty==ReservationStation)
+                {
+                        if (coredynp.rm_ty ==RAMbased)
+                        {
+                                iFRAT->power_t.reset();
+                                fFRAT->power_t.reset();
+
+                                iFRAT->power_t.readOp.dynamic  +=  (iFRAT->stats_t.readAc.access
+                                                *(iFRAT->local_result.power.readOp.dynamic + idcl->power.readOp.dynamic)
+                                                +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic
+                                                +iFRAT->stats_t.searchAc.access*iFRAT->local_result.power.searchOp.dynamic);
+                                fFRAT->power_t.readOp.dynamic  +=  (fFRAT->stats_t.readAc.access
+                                                *(fFRAT->local_result.power.readOp.dynamic + fdcl->power.readOp.dynamic)
+                                                +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic
+                                                +fFRAT->stats_t.searchAc.access*fFRAT->local_result.power.searchOp.dynamic);
+                        }
+                        else if ((coredynp.rm_ty ==CAMbased))
+                        {
+                                iFRAT->power_t.reset();
+                                fFRAT->power_t.reset();
+                                iFRAT->power_t.readOp.dynamic  +=  (iFRAT->stats_t.readAc.access
+                                                *(iFRAT->local_result.power.searchOp.dynamic + idcl->power.readOp.dynamic)
+                                                +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic);
+                                fFRAT->power_t.readOp.dynamic  +=  (fFRAT->stats_t.readAc.access
+                                                *(fFRAT->local_result.power.searchOp.dynamic + fdcl->power.readOp.dynamic)
+                                                +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic);
+                        }
+                        ifreeL->power_t.reset();
+                        ifreeL->power_t.readOp.dynamic  +=  (ifreeL->stats_t.readAc.access*ifreeL->local_result.power.readOp.dynamic
+                                        +ifreeL->stats_t.writeAc.access*ifreeL->local_result.power.writeOp.dynamic);
+                }
+
+        }
+        else
+        {
+                if (coredynp.issueW>1)
+                {
+                        idcl->power_t.reset();
+                        fdcl->power_t.reset();
+                        set_pppm(pppm_t, idcl->stats_t.readAc.access, coredynp.num_hthreads, coredynp.num_hthreads, idcl->stats_t.readAc.access);
+                        idcl->power_t = idcl->power * pppm_t;
+                        set_pppm(pppm_t, fdcl->stats_t.readAc.access, coredynp.num_hthreads, coredynp.num_hthreads, idcl->stats_t.readAc.access);
+                        fdcl->power_t = fdcl->power * pppm_t;
+                }
+
+        }
+
+        //assign value to tpd and rtp
+        if (is_tdp)
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                iFRAT->power   =  iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t;
+                                fFRAT->power   =  fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t;
+                                iRRAT->power   =  iRRAT->power_t + iRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+                                fRRAT->power   =  fRRAT->power_t + fRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+                                ifreeL->power  =  ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                ffreeL->power  =  ffreeL->power_t + ffreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                power         =  power + (iFRAT->power + fFRAT->power)
+                                                 + (iRRAT->power + fRRAT->power)
+                                                 + (ifreeL->power + ffreeL->power);
+                        }
+                        else if (coredynp.scheu_ty==ReservationStation)
+                        {
+                                iFRAT->power   =  iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t;
+                                fFRAT->power   =  fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t;
+                                ifreeL->power  =  ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                power         =  power + (iFRAT->power + fFRAT->power)
+                                                 + ifreeL->power;
+                        }
+                }
+                else
+                {
+                        power   =  power + idcl->power_t + fdcl->power_t;
+                }
+
+        }
+        else
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                iFRAT->rt_power   =  iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t;
+                                fFRAT->rt_power   =  fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t;
+                                iRRAT->rt_power   =  iRRAT->power_t + iRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+                                fRRAT->rt_power   =  fRRAT->power_t + fRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+                                ifreeL->rt_power  =  ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                ffreeL->rt_power  =  ffreeL->power_t + ffreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                rt_power             =  rt_power + (iFRAT->rt_power + fFRAT->rt_power)
+                                                   + (iRRAT->rt_power + fRRAT->rt_power)
+                                                   + (ifreeL->rt_power + ffreeL->rt_power);
+                        }
+                        else if (coredynp.scheu_ty==ReservationStation)
+                        {
+                                iFRAT->rt_power   =  iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t;
+                                fFRAT->rt_power   =  fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t;
+                                ifreeL->rt_power  =  ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread;
+                                rt_power             =  rt_power + (iFRAT->rt_power + fFRAT->rt_power)
+                                                   + ifreeL->rt_power;
+                        }
+                }
+                else
+                {
+                        rt_power   =  rt_power + idcl->power_t + fdcl->power_t;
+                }
+
+        }
+}
+
+void RENAMINGU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+        if (is_tdp)
+        {
+
+                if (coredynp.core_ty==OOO)
+                {
+                        cout << indent_str<< "Int Front End RAT:" << endl;
+                        cout << indent_str_next << "Area = " << iFRAT->area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << iFRAT->power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? iFRAT->power.readOp.longer_channel_leakage:iFRAT->power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << iFRAT->power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << iFRAT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        cout << indent_str<< "FP Front End RAT:" << endl;
+                        cout << indent_str_next << "Area = " << fFRAT->area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << fFRAT->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? fFRAT->power.readOp.longer_channel_leakage:fFRAT->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << fFRAT->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << fFRAT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        cout << indent_str<<"Free List:" << endl;
+                        cout << indent_str_next << "Area = " << ifreeL->area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << ifreeL->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? ifreeL->power.readOp.longer_channel_leakage:ifreeL->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << ifreeL->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << ifreeL->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                cout << indent_str<< "Int Retire RAT: " << endl;
+                                cout << indent_str_next << "Area = " << iRRAT->area.get_area() *1e-6 << " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << iRRAT->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                        << (long_channel? iRRAT->power.readOp.longer_channel_leakage:iRRAT->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << iRRAT->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << iRRAT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                                cout << indent_str<< "FP Retire RAT:" << endl;
+                                cout << indent_str_next << "Area = " << fRRAT->area.get_area()  *1e-6<< " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << fRRAT->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                        << (long_channel? fRRAT->power.readOp.longer_channel_leakage:fRRAT->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << fRRAT->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << fRRAT->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                                cout << indent_str<< "FP Free List:" << endl;
+                                cout << indent_str_next << "Area = " << ffreeL->area.get_area()*1e-6  << " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << ffreeL->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                        << (long_channel? ffreeL->power.readOp.longer_channel_leakage:ffreeL->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << ffreeL->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << ffreeL->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                        }
+                }
+                else
+                {
+                        cout << indent_str<< "Int DCL:" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << idcl->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? idcl->power.readOp.longer_channel_leakage:idcl->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << idcl->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << idcl->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout << indent_str<<"FP DCL:" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << fdcl->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? fdcl->power.readOp.longer_channel_leakage:fdcl->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << fdcl->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << fdcl->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                }
+        }
+        else
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        cout << indent_str_next << "Int Front End RAT    Peak Dynamic = " << iFRAT->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Int Front End RAT    Subthreshold Leakage = " << iFRAT->rt_power.readOp.leakage <<" W" << endl;
+                        cout << indent_str_next << "Int Front End RAT    Gate Leakage = " << iFRAT->rt_power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "FP Front End RAT   Peak Dynamic = " << fFRAT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "FP Front End RAT   Subthreshold Leakage = " << fFRAT->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "FP Front End RAT   Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Free List   Peak Dynamic = " << ifreeL->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Free List   Subthreshold Leakage = " << ifreeL->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "Free List   Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage  << " W" << endl;
+                        if (coredynp.scheu_ty==PhysicalRegFile)
+                        {
+                                cout << indent_str_next << "Int Retire RAT   Peak Dynamic = " << iRRAT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Int Retire RAT   Subthreshold Leakage = " << iRRAT->rt_power.readOp.leakage  << " W" << endl;
+                                cout << indent_str_next << "Int Retire RAT   Gate Leakage = " << iRRAT->rt_power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "FP Retire RAT   Peak Dynamic = " << fRRAT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "FP Retire RAT   Subthreshold Leakage = " << fRRAT->rt_power.readOp.leakage  << " W" << endl;
+                                cout << indent_str_next << "FP Retire RAT   Gate Leakage = " << fRRAT->rt_power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "FP Free List   Peak Dynamic = " << ffreeL->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "FP Free List   Subthreshold Leakage = " << ffreeL->rt_power.readOp.leakage  << " W" << endl;
+                                cout << indent_str_next << "FP Free List   Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage  << " W" << endl;
+                        }
+                }
+                else
+                {
+                        cout << indent_str_next << "Int DCL   Peak Dynamic = " << idcl->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Int DCL   Subthreshold Leakage = " << idcl->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "Int DCL   Gate Leakage = " << idcl->rt_power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "FP DCL   Peak Dynamic = " << fdcl->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "FP DCL   Subthreshold Leakage = " << fdcl->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "FP DCL   Gate Leakage = " << fdcl->rt_power.readOp.gate_leakage  << " W" << endl;
+                }
+        }
+
+}
+
+
+void SchedulerU::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        double ROB_duty_cycle;
+//     ROB_duty_cycle = ((coredynp.ALU_duty_cycle + coredynp.num_muls>0?coredynp.MUL_duty_cycle:0
+//                     + coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0))*1.1<1 ? (coredynp.ALU_duty_cycle + coredynp.num_muls>0?coredynp.MUL_duty_cycle:0
+//                                     + coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0)*1.1:1;
+        ROB_duty_cycle = 1;
+        //init stats
+        if (is_tdp)
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        int_inst_window->stats_t.readAc.access    = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_search_ports;
+                        int_inst_window->stats_t.writeAc.access   = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_wr_ports;
+                        int_inst_window->stats_t.searchAc.access  = coredynp.issueW*coredynp.num_pipelines;
+                        int_inst_window->tdp_stats                = int_inst_window->stats_t;
+                        fp_inst_window->stats_t.readAc.access     = fp_inst_window->l_ip.num_rd_ports*coredynp.num_fp_pipelines;
+                        fp_inst_window->stats_t.writeAc.access    = fp_inst_window->l_ip.num_wr_ports*coredynp.num_fp_pipelines;
+                        fp_inst_window->stats_t.searchAc.access   = fp_inst_window->l_ip.num_search_ports*coredynp.num_fp_pipelines;
+                        fp_inst_window->tdp_stats                 = fp_inst_window->stats_t;
+
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+                                ROB->stats_t.readAc.access   = coredynp.commitW*coredynp.num_pipelines*ROB_duty_cycle;
+                                ROB->stats_t.writeAc.access  = coredynp.issueW*coredynp.num_pipelines*ROB_duty_cycle;
+                                ROB->tdp_stats        = ROB->stats_t;
+
+                                /*
+                                 * When inst commits, ROB must be read.
+                                 * Because for Physcial register based cores, physical register tag in ROB
+                                 * need to be read out and write into RRAT/CAM based RAT.
+                                 * For RS based cores, register content that stored in ROB must be
+                                 * read out and stored in architectural registers.
+                                 *
+                                 * if no-register is involved, the ROB read out operation when instruction commits can be ignored.
+                                 * assuming 20% insts. belong this type.
+                                 * TODO: ROB duty_cycle need to be revisited
+                                 */
+                        }
+
+                }
+                else if (coredynp.multithreaded)
+                {
+                        int_inst_window->stats_t.readAc.access   = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_search_ports;
+                        int_inst_window->stats_t.writeAc.access  = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_wr_ports;
+                        int_inst_window->stats_t.searchAc.access = coredynp.issueW*coredynp.num_pipelines;
+                        int_inst_window->tdp_stats       = int_inst_window->stats_t;
+                }
+
+     }
+    else
+    {//rtp
+                if (coredynp.core_ty==OOO)
+                {
+                        int_inst_window->stats_t.readAc.access   = XML->sys.core[ithCore].inst_window_reads;
+                        int_inst_window->stats_t.writeAc.access  = XML->sys.core[ithCore].inst_window_writes;
+                        int_inst_window->stats_t.searchAc.access = XML->sys.core[ithCore].inst_window_wakeup_accesses;
+                        int_inst_window->rtp_stats               = int_inst_window->stats_t;
+                        fp_inst_window->stats_t.readAc.access    = XML->sys.core[ithCore].fp_inst_window_reads;
+                        fp_inst_window->stats_t.writeAc.access   = XML->sys.core[ithCore].fp_inst_window_writes;
+                        fp_inst_window->stats_t.searchAc.access  = XML->sys.core[ithCore].fp_inst_window_wakeup_accesses;
+                        fp_inst_window->rtp_stats                = fp_inst_window->stats_t;
+
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+
+                                ROB->stats_t.readAc.access   = XML->sys.core[ithCore].ROB_reads;
+                                ROB->stats_t.writeAc.access  = XML->sys.core[ithCore].ROB_writes;
+                                /* ROB need to be updated in RS based OOO when new values are produced,
+                                 * this update may happen before the commit stage when ROB entry is released
+                                 * 1. ROB write at instruction inserted in
+                                 * 2. ROB write as results produced (for RS based OOO only)
+                                 * 3. ROB read  as instruction committed. For RS based OOO, data values are read out and sent to ARF
+                                 * For Physical reg based OOO, no data stored in ROB, but register tags need to be
+                                 * read out and used to set the RRAT and to recycle the register tag to free list buffer
+                                 */
+                                ROB->rtp_stats        = ROB->stats_t;
+                        }
+
+                }
+                else if (coredynp.multithreaded)
+                {
+                        int_inst_window->stats_t.readAc.access    = XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions;
+                        int_inst_window->stats_t.writeAc.access   = XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions;
+                        int_inst_window->stats_t.searchAc.access  = 2*(XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions);
+                        int_inst_window->rtp_stats                = int_inst_window->stats_t;
+                }
+    }
+
+        //computation engine
+        if (coredynp.core_ty==OOO)
+        {
+                int_inst_window->power_t.reset();
+                fp_inst_window->power_t.reset();
+
+                /* each instruction needs to write to scheduler, read out when all resources and source operands are ready
+                 * two search ops with one for each source operand
+                 *
+                 */
+                int_inst_window->power_t.readOp.dynamic  +=  int_inst_window->local_result.power.readOp.dynamic * int_inst_window->stats_t.readAc.access
+                                        + int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.searchAc.access
+                                        + int_inst_window->local_result.power.writeOp.dynamic  * int_inst_window->stats_t.writeAc.access
+                                        + int_inst_window->stats_t.readAc.access * instruction_selection->power.readOp.dynamic;
+
+                fp_inst_window->power_t.readOp.dynamic   +=  fp_inst_window->local_result.power.readOp.dynamic * fp_inst_window->stats_t.readAc.access
+                                        + fp_inst_window->local_result.power.searchOp.dynamic * fp_inst_window->stats_t.searchAc.access
+                                        + fp_inst_window->local_result.power.writeOp.dynamic * fp_inst_window->stats_t.writeAc.access
+                                        + fp_inst_window->stats_t.writeAc.access * instruction_selection->power.readOp.dynamic;
+
+                if (XML->sys.core[ithCore].ROB_size >0)
+                {
+                        ROB->power_t.reset();
+                        ROB->power_t.readOp.dynamic   +=  ROB->local_result.power.readOp.dynamic*ROB->stats_t.readAc.access +
+                                                ROB->stats_t.writeAc.access*ROB->local_result.power.writeOp.dynamic;
+                }
+
+
+
+
+        }
+        else if (coredynp.multithreaded)
+        {
+                int_inst_window->power_t.reset();
+                int_inst_window->power_t.readOp.dynamic  +=  int_inst_window->local_result.power.readOp.dynamic * int_inst_window->stats_t.readAc.access
+                                                  + int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.searchAc.access
+                                          + int_inst_window->local_result.power.writeOp.dynamic  * int_inst_window->stats_t.writeAc.access
+                                          + int_inst_window->stats_t.writeAc.access * instruction_selection->power.readOp.dynamic;
+        }
+
+        //assign values
+        if (is_tdp)
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        int_inst_window->power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        fp_inst_window->power = fp_inst_window->power_t + (fp_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        power     = power + int_inst_window->power + fp_inst_window->power;
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+                                ROB->power = ROB->power_t + ROB->local_result.power*pppm_lkg;
+                                power     = power + ROB->power;
+                        }
+
+                }
+                else if (coredynp.multithreaded)
+                {
+                        //                     set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1);
+                        int_inst_window->power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        power     = power + int_inst_window->power;
+        }
+
+     }
+    else
+    {//rtp
+                if (coredynp.core_ty==OOO)
+                {
+                        int_inst_window->rt_power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        fp_inst_window->rt_power  = fp_inst_window->power_t + (fp_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        rt_power                     = rt_power + int_inst_window->rt_power + fp_inst_window->rt_power;
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+                                ROB->rt_power = ROB->power_t + ROB->local_result.power*pppm_lkg;
+                                rt_power                     = rt_power + ROB->rt_power;
+                        }
+
+                }
+                else if (coredynp.multithreaded)
+                {
+                        //                     set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1);
+                        int_inst_window->rt_power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg;
+                        rt_power                     = rt_power + int_inst_window->rt_power;
+        }
+    }
+//     set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1);
+//     cout<<"Scheduler power="<<power.readOp.dynamic<<"leakage="<<power.readOp.leakage<<endl;
+//     cout<<"IW="<<int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.readAc.access +
+//    + int_inst_window->local_result.power.writeOp.dynamic * int_inst_window->stats_t.writeAc.access<<"leakage="<<int_inst_window->local_result.power.readOp.leakage<<endl;
+//     cout<<"selection"<<instruction_selection->power.readOp.dynamic<<"leakage"<<instruction_selection->power.readOp.leakage<<endl;
+}
+
+void SchedulerU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+        if (is_tdp)
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        cout << indent_str << "Instruction Window:" << endl;
+                        cout << indent_str_next << "Area = " << int_inst_window->area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << int_inst_window->power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? int_inst_window->power.readOp.longer_channel_leakage:int_inst_window->power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << int_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        cout << indent_str << "FP Instruction Window:" << endl;
+                        cout << indent_str_next << "Area = " << fp_inst_window->area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << fp_inst_window->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? fp_inst_window->power.readOp.longer_channel_leakage:fp_inst_window->power.readOp.leakage ) << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << fp_inst_window->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << fp_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+                                cout << indent_str<<"ROB:" << endl;
+                                cout << indent_str_next << "Area = " << ROB->area.get_area() *1e-6 << " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << ROB->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? ROB->power.readOp.longer_channel_leakage:ROB->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << ROB->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << ROB->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                        }
+                }
+                else if (coredynp.multithreaded)
+                {
+                        cout << indent_str << "Instruction Window:" << endl;
+                        cout << indent_str_next << "Area = " << int_inst_window->area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << int_inst_window->power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? int_inst_window->power.readOp.longer_channel_leakage:int_inst_window->power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << int_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                }
+        }
+        else
+        {
+                if (coredynp.core_ty==OOO)
+                {
+                        cout << indent_str_next << "Instruction Window    Peak Dynamic = " << int_inst_window->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Instruction Window    Subthreshold Leakage = " << int_inst_window->rt_power.readOp.leakage <<" W" << endl;
+                        cout << indent_str_next << "Instruction Window    Gate Leakage = " << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "FP Instruction Window   Peak Dynamic = " << fp_inst_window->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "FP Instruction Window   Subthreshold Leakage = " << fp_inst_window->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "FP Instruction Window   Gate Leakage = " << fp_inst_window->rt_power.readOp.gate_leakage  << " W" << endl;
+                        if (XML->sys.core[ithCore].ROB_size >0)
+                        {
+                                cout << indent_str_next << "ROB   Peak Dynamic = " << ROB->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "ROB   Subthreshold Leakage = " << ROB->rt_power.readOp.leakage  << " W" << endl;
+                                cout << indent_str_next << "ROB   Gate Leakage = " << ROB->rt_power.readOp.gate_leakage  << " W" << endl;
+                        }
+                }
+                else if (coredynp.multithreaded)
+                {
+                        cout << indent_str_next << "Instruction Window    Peak Dynamic = " << int_inst_window->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Instruction Window    Subthreshold Leakage = " << int_inst_window->rt_power.readOp.leakage <<" W" << endl;
+                        cout << indent_str_next << "Instruction Window    Gate Leakage = " << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl;
+                }
+        }
+
+}
+
+void LoadStoreU::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        if (is_tdp)
+            {
+                //init stats for Peak
+                dcache.caches->stats_t.readAc.access  = 0.67*dcache.caches->l_ip.num_rw_ports*coredynp.LSU_duty_cycle;
+                dcache.caches->stats_t.readAc.miss    = 0;
+                dcache.caches->stats_t.readAc.hit     = dcache.caches->stats_t.readAc.access - dcache.caches->stats_t.readAc.miss;
+                dcache.caches->stats_t.writeAc.access = 0.33*dcache.caches->l_ip.num_rw_ports*coredynp.LSU_duty_cycle;
+                dcache.caches->stats_t.writeAc.miss   = 0;
+                dcache.caches->stats_t.writeAc.hit    = dcache.caches->stats_t.writeAc.access -        dcache.caches->stats_t.writeAc.miss;
+                dcache.caches->tdp_stats = dcache.caches->stats_t;
+
+                dcache.missb->stats_t.readAc.access  = dcache.missb->l_ip.num_search_ports;
+                dcache.missb->stats_t.writeAc.access = dcache.missb->l_ip.num_search_ports;
+                dcache.missb->tdp_stats = dcache.missb->stats_t;
+
+                dcache.ifb->stats_t.readAc.access  = dcache.ifb->l_ip.num_search_ports;
+                dcache.ifb->stats_t.writeAc.access = dcache.ifb->l_ip.num_search_ports;
+                dcache.ifb->tdp_stats = dcache.ifb->stats_t;
+
+                dcache.prefetchb->stats_t.readAc.access  = dcache.prefetchb->l_ip.num_search_ports;
+                dcache.prefetchb->stats_t.writeAc.access = dcache.ifb->l_ip.num_search_ports;
+                dcache.prefetchb->tdp_stats = dcache.prefetchb->stats_t;
+                if (cache_p==Write_back)
+                {
+                        dcache.wbb->stats_t.readAc.access  = dcache.wbb->l_ip.num_search_ports;
+                        dcache.wbb->stats_t.writeAc.access = dcache.wbb->l_ip.num_search_ports;
+                        dcache.wbb->tdp_stats = dcache.wbb->stats_t;
+                }
+
+                LSQ->stats_t.readAc.access = LSQ->stats_t.writeAc.access = LSQ->l_ip.num_search_ports*coredynp.LSU_duty_cycle;
+                LSQ->tdp_stats = LSQ->stats_t;
+                if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0))
+                {
+                        LoadQ->stats_t.readAc.access = LoadQ->stats_t.writeAc.access = LoadQ->l_ip.num_search_ports*coredynp.LSU_duty_cycle;
+                        LoadQ->tdp_stats = LoadQ->stats_t;
+                }
+            }
+            else
+            {
+                //init stats for Runtime Dynamic (RTP)
+                dcache.caches->stats_t.readAc.access  = XML->sys.core[ithCore].dcache.read_accesses;
+                dcache.caches->stats_t.readAc.miss    = XML->sys.core[ithCore].dcache.read_misses;
+                dcache.caches->stats_t.readAc.hit     = dcache.caches->stats_t.readAc.access - dcache.caches->stats_t.readAc.miss;
+                dcache.caches->stats_t.writeAc.access = XML->sys.core[ithCore].dcache.write_accesses;
+                dcache.caches->stats_t.writeAc.miss   = XML->sys.core[ithCore].dcache.write_misses;
+                dcache.caches->stats_t.writeAc.hit    = dcache.caches->stats_t.writeAc.access -        dcache.caches->stats_t.writeAc.miss;
+                dcache.caches->rtp_stats = dcache.caches->stats_t;
+
+                if (cache_p==Write_back)
+                {
+                        dcache.missb->stats_t.readAc.access  = dcache.caches->stats_t.writeAc.miss;
+                        dcache.missb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+                        dcache.missb->rtp_stats = dcache.missb->stats_t;
+
+                        dcache.ifb->stats_t.readAc.access  = dcache.caches->stats_t.writeAc.miss;
+                        dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+                        dcache.ifb->rtp_stats = dcache.ifb->stats_t;
+
+                        dcache.prefetchb->stats_t.readAc.access  = dcache.caches->stats_t.writeAc.miss;
+                        dcache.prefetchb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+                        dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t;
+
+                        dcache.wbb->stats_t.readAc.access  = dcache.caches->stats_t.writeAc.miss;
+                        dcache.wbb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+                        dcache.wbb->rtp_stats = dcache.wbb->stats_t;
+                }
+                else
+                {
+                        dcache.missb->stats_t.readAc.access  = dcache.caches->stats_t.readAc.miss;
+                        dcache.missb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss;
+                        dcache.missb->rtp_stats = dcache.missb->stats_t;
+
+                        dcache.ifb->stats_t.readAc.access  = dcache.caches->stats_t.readAc.miss;
+                        dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss;
+                        dcache.ifb->rtp_stats = dcache.ifb->stats_t;
+
+                        dcache.prefetchb->stats_t.readAc.access  = dcache.caches->stats_t.readAc.miss;
+                        dcache.prefetchb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss;
+                        dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t;
+                }
+
+                LSQ->stats_t.readAc.access  = (XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions)*2;//flush overhead considered
+                LSQ->stats_t.writeAc.access = (XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions)*2;
+                LSQ->rtp_stats = LSQ->stats_t;
+
+                if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0))
+                {
+                        LoadQ->stats_t.readAc.access  = XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions;
+                        LoadQ->stats_t.writeAc.access = XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions;
+                        LoadQ->rtp_stats = LoadQ->stats_t;
+                }
+
+            }
+
+        dcache.power_t.reset();
+        LSQ->power_t.reset();
+    dcache.power_t.readOp.dynamic      += (dcache.caches->stats_t.readAc.hit*dcache.caches->local_result.power.readOp.dynamic+
+                dcache.caches->stats_t.readAc.miss*dcache.caches->local_result.power.readOp.dynamic+
+                dcache.caches->stats_t.writeAc.miss*dcache.caches->local_result.tag_array2->power.readOp.dynamic+
+                dcache.caches->stats_t.writeAc.access*dcache.caches->local_result.power.writeOp.dynamic);
+
+    if (cache_p==Write_back)
+    {//write miss will generate a write later
+        dcache.power_t.readOp.dynamic  += dcache.caches->stats_t.writeAc.miss*dcache.caches->local_result.power.writeOp.dynamic;
+    }
+
+    dcache.power_t.readOp.dynamic      +=  dcache.missb->stats_t.readAc.access*dcache.missb->local_result.power.searchOp.dynamic +
+            dcache.missb->stats_t.writeAc.access*dcache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write
+    dcache.power_t.readOp.dynamic      +=  dcache.ifb->stats_t.readAc.access*dcache.ifb->local_result.power.searchOp.dynamic +
+            dcache.ifb->stats_t.writeAc.access*dcache.ifb->local_result.power.writeOp.dynamic;
+    dcache.power_t.readOp.dynamic      +=  dcache.prefetchb->stats_t.readAc.access*dcache.prefetchb->local_result.power.searchOp.dynamic +
+            dcache.prefetchb->stats_t.writeAc.access*dcache.prefetchb->local_result.power.writeOp.dynamic;
+    if (cache_p==Write_back)
+    {
+        dcache.power_t.readOp.dynamic  +=  dcache.wbb->stats_t.readAc.access*dcache.wbb->local_result.power.searchOp.dynamic
+                        + dcache.wbb->stats_t.writeAc.access*dcache.wbb->local_result.power.writeOp.dynamic;
+    }
+
+    if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0))
+    {
+        LoadQ->power_t.reset();
+        LoadQ->power_t.readOp.dynamic  +=  LoadQ->stats_t.readAc.access*(LoadQ->local_result.power.searchOp.dynamic+ LoadQ->local_result.power.readOp.dynamic)+
+                LoadQ->stats_t.writeAc.access*LoadQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LoadQ
+
+        LSQ->power_t.readOp.dynamic  +=  LSQ->stats_t.readAc.access*(LSQ->local_result.power.searchOp.dynamic + LSQ->local_result.power.readOp.dynamic)
+                        + LSQ->stats_t.writeAc.access*LSQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LSQ
+
+    }
+    else
+    {
+        LSQ->power_t.readOp.dynamic  +=  LSQ->stats_t.readAc.access*(LSQ->local_result.power.searchOp.dynamic + LSQ->local_result.power.readOp.dynamic)
+                        + LSQ->stats_t.writeAc.access*LSQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LSQ
+
+    }
+
+    if (is_tdp)
+    {
+//     dcache.power = dcache.power_t + (dcache.caches->local_result.power)*pppm_lkg +
+//                     (dcache.missb->local_result.power +
+//                     dcache.ifb->local_result.power +
+//                     dcache.prefetchb->local_result.power +
+//                     dcache.wbb->local_result.power)*pppm_Isub;
+        dcache.power = dcache.power_t + (dcache.caches->local_result.power +
+                        dcache.missb->local_result.power +
+                        dcache.ifb->local_result.power +
+                        dcache.prefetchb->local_result.power) *pppm_lkg;
+        if (cache_p==Write_back)
+        {
+                dcache.power = dcache.power + dcache.wbb->local_result.power*pppm_lkg;
+        }
+
+        LSQ->power = LSQ->power_t + LSQ->local_result.power *pppm_lkg;
+        power     = power + dcache.power + LSQ->power;
+
+        if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0))
+        {
+                LoadQ->power = LoadQ->power_t + LoadQ->local_result.power *pppm_lkg;
+                power     = power + LoadQ->power;
+        }
+    }
+    else
+    {
+//     dcache.rt_power = dcache.power_t + (dcache.caches->local_result.power +
+//                     dcache.missb->local_result.power +
+//                     dcache.ifb->local_result.power +
+//                     dcache.prefetchb->local_result.power +
+//                     dcache.wbb->local_result.power)*pppm_lkg;
+        dcache.rt_power = dcache.power_t + (dcache.caches->local_result.power +
+                        dcache.missb->local_result.power +
+                        dcache.ifb->local_result.power +
+                        dcache.prefetchb->local_result.power )*pppm_lkg;
+
+        if (cache_p==Write_back)
+        {
+                dcache.rt_power = dcache.rt_power + dcache.wbb->local_result.power*pppm_lkg;
+        }
+
+        LSQ->rt_power = LSQ->power_t + LSQ->local_result.power *pppm_lkg;
+        rt_power     = rt_power + dcache.rt_power + LSQ->rt_power;
+
+        if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0))
+        {
+                LoadQ->rt_power = LoadQ->power_t + LoadQ->local_result.power *pppm_lkg;
+                rt_power     = rt_power + LoadQ->rt_power;
+        }
+    }
+}
+
+
+void LoadStoreU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+        if (is_tdp)
+        {
+                cout << indent_str << "Data Cache:" << endl;
+                cout << indent_str_next << "Area = " << dcache.area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << dcache.power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? dcache.power.readOp.longer_channel_leakage:dcache.power.readOp.leakage )<<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << dcache.power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << dcache.rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                if (coredynp.core_ty==Inorder)
+                {
+                        cout << indent_str << "Load/Store Queue:" << endl;
+                        cout << indent_str_next << "Area = " << LSQ->area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << LSQ->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? LSQ->power.readOp.longer_channel_leakage:LSQ->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << LSQ->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << LSQ->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                }
+                else
+
+                {
+                        if (XML->sys.core[ithCore].load_buffer_size >0)
+                        {
+                                cout << indent_str << "LoadQ:" << endl;
+                                cout << indent_str_next << "Area = " << LoadQ->area.get_area() *1e-6 << " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << LoadQ->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? LoadQ->power.readOp.longer_channel_leakage:LoadQ->power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << LoadQ->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << LoadQ->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                        }
+                        cout << indent_str<< "StoreQ:" << endl;
+                        cout << indent_str_next << "Area = " << LSQ->area.get_area()  *1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << LSQ->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? LSQ->power.readOp.longer_channel_leakage:LSQ->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << LSQ->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << LSQ->rt_power.readOp.dynamic/executionTime<< " W" << endl;
+                        cout <<endl;
+                }
+        }
+        else
+        {
+                cout << indent_str_next << "Data Cache    Peak Dynamic = " << dcache.rt_power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Data Cache    Subthreshold Leakage = " << dcache.rt_power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Data Cache    Gate Leakage = " << dcache.rt_power.readOp.gate_leakage << " W" << endl;
+                if (coredynp.core_ty==Inorder)
+                {
+                        cout << indent_str_next << "Load/Store Queue   Peak Dynamic = " << LSQ->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Load/Store Queue   Subthreshold Leakage = " << LSQ->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "Load/Store Queue   Gate Leakage = " << LSQ->rt_power.readOp.gate_leakage  << " W" << endl;
+                }
+                else
+                {
+                        cout << indent_str_next << "LoadQ   Peak Dynamic = " << LoadQ->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "LoadQ   Subthreshold Leakage = " << LoadQ->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "LoadQ   Gate Leakage = " << LoadQ->rt_power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "StoreQ   Peak Dynamic = " << LSQ->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "StoreQ   Subthreshold Leakage = " << LSQ->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "StoreQ   Gate Leakage = " << LSQ->rt_power.readOp.gate_leakage  << " W" << endl;
+                }
+        }
+
+}
+
+void MemManU::computeEnergy(bool is_tdp)
+{
+
+        if (!exist) return;
+        if (is_tdp)
+    {
+        //init stats for Peak
+        itlb->stats_t.readAc.access  = itlb->l_ip.num_search_ports;
+        itlb->stats_t.readAc.miss    = 0;
+        itlb->stats_t.readAc.hit     = itlb->stats_t.readAc.access - itlb->stats_t.readAc.miss;
+        itlb->tdp_stats = itlb->stats_t;
+
+        dtlb->stats_t.readAc.access  = dtlb->l_ip.num_search_ports*coredynp.LSU_duty_cycle;
+        dtlb->stats_t.readAc.miss    = 0;
+        dtlb->stats_t.readAc.hit     = dtlb->stats_t.readAc.access - dtlb->stats_t.readAc.miss;
+        dtlb->tdp_stats = dtlb->stats_t;
+     }
+    else
+    {
+        //init stats for Runtime Dynamic (RTP)
+        itlb->stats_t.readAc.access  = XML->sys.core[ithCore].itlb.total_accesses;
+        itlb->stats_t.readAc.miss    = XML->sys.core[ithCore].itlb.total_misses;
+        itlb->stats_t.readAc.hit     = itlb->stats_t.readAc.access - itlb->stats_t.readAc.miss;
+        itlb->rtp_stats = itlb->stats_t;
+
+        dtlb->stats_t.readAc.access  = XML->sys.core[ithCore].dtlb.total_accesses;
+        dtlb->stats_t.readAc.miss    = XML->sys.core[ithCore].dtlb.total_misses;
+        dtlb->stats_t.readAc.hit     = dtlb->stats_t.readAc.access - dtlb->stats_t.readAc.miss;
+        dtlb->rtp_stats = dtlb->stats_t;
+    }
+
+    itlb->power_t.reset();
+    dtlb->power_t.reset();
+        itlb->power_t.readOp.dynamic +=  itlb->stats_t.readAc.access*itlb->local_result.power.searchOp.dynamic//FA spent most power in tag, so use total access not hits
+                              +itlb->stats_t.readAc.miss*itlb->local_result.power.writeOp.dynamic;
+        dtlb->power_t.readOp.dynamic +=  dtlb->stats_t.readAc.access*dtlb->local_result.power.searchOp.dynamic//FA spent most power in tag, so use total access not hits
+                              +dtlb->stats_t.readAc.miss*dtlb->local_result.power.writeOp.dynamic;
+
+        if (is_tdp)
+            {
+                itlb->power = itlb->power_t + itlb->local_result.power *pppm_lkg;
+                dtlb->power = dtlb->power_t + dtlb->local_result.power *pppm_lkg;
+                power     = power + itlb->power + dtlb->power;
+            }
+            else
+            {
+                        itlb->rt_power = itlb->power_t + itlb->local_result.power *pppm_lkg;
+                        dtlb->rt_power = dtlb->power_t + dtlb->local_result.power *pppm_lkg;
+                        rt_power     = rt_power + itlb->rt_power + dtlb->rt_power;
+            }
+}
+
+void MemManU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+
+
+        if (is_tdp)
+        {
+                cout << indent_str << "Itlb:" << endl;
+                cout << indent_str_next << "Area = " << itlb->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << itlb->power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? itlb->power.readOp.longer_channel_leakage:itlb->power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << itlb->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << itlb->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str<< "Dtlb:" << endl;
+                cout << indent_str_next << "Area = " << dtlb->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << dtlb->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? dtlb->power.readOp.longer_channel_leakage:dtlb->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << dtlb->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << dtlb->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+                cout << indent_str_next << "Itlb    Peak Dynamic = " << itlb->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Itlb    Subthreshold Leakage = " << itlb->rt_power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Itlb    Gate Leakage = " << itlb->rt_power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Dtlb   Peak Dynamic = " << dtlb->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Dtlb   Subthreshold Leakage = " << dtlb->rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Dtlb   Gate Leakage = " << dtlb->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+
+}
+
+void RegFU::computeEnergy(bool is_tdp)
+{
+/*
+ * Architecture RF and physical RF cannot be present at the same time.
+ * Therefore, the RF stats can only refer to either ARF or PRF;
+ * And the same stats can be used for both.
+ */
+        if (!exist) return;
+        if (is_tdp)
+    {
+        //init stats for Peak
+        IRF->stats_t.readAc.access  = coredynp.issueW*2*(coredynp.ALU_duty_cycle*1.1+
+                        (coredynp.num_muls>0?coredynp.MUL_duty_cycle:0))*coredynp.num_pipelines;
+        IRF->stats_t.writeAc.access  = coredynp.issueW*(coredynp.ALU_duty_cycle*1.1+
+                        (coredynp.num_muls>0?coredynp.MUL_duty_cycle:0))*coredynp.num_pipelines;
+        //Rule of Thumb: about 10% RF related instructions do not need to access ALUs
+        IRF->tdp_stats = IRF->stats_t;
+
+        FRF->stats_t.readAc.access  = FRF->l_ip.num_rd_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines;
+        FRF->stats_t.writeAc.access  = FRF->l_ip.num_wr_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines;
+        FRF->tdp_stats = FRF->stats_t;
+        if (coredynp.regWindowing)
+        {
+                RFWIN->stats_t.readAc.access  = 0;//0.5*RFWIN->l_ip.num_rw_ports;
+                RFWIN->stats_t.writeAc.access  = 0;//0.5*RFWIN->l_ip.num_rw_ports;
+                RFWIN->tdp_stats = RFWIN->stats_t;
+        }
+     }
+    else
+    {
+        //init stats for Runtime Dynamic (RTP)
+        IRF->stats_t.readAc.access  = XML->sys.core[ithCore].int_regfile_reads;//TODO: no diff on archi and phy
+        IRF->stats_t.writeAc.access  = XML->sys.core[ithCore].int_regfile_writes;
+        IRF->rtp_stats = IRF->stats_t;
+
+        FRF->stats_t.readAc.access  = XML->sys.core[ithCore].float_regfile_reads;
+        FRF->stats_t.writeAc.access  = XML->sys.core[ithCore].float_regfile_writes;
+        FRF->rtp_stats = FRF->stats_t;
+        if (coredynp.regWindowing)
+        {
+                RFWIN->stats_t.readAc.access  = XML->sys.core[ithCore].function_calls*16;
+                RFWIN->stats_t.writeAc.access  = XML->sys.core[ithCore].function_calls*16;
+                RFWIN->rtp_stats = RFWIN->stats_t;
+
+                IRF->stats_t.readAc.access  = XML->sys.core[ithCore].int_regfile_reads +
+                     XML->sys.core[ithCore].function_calls*16;
+                IRF->stats_t.writeAc.access  = XML->sys.core[ithCore].int_regfile_writes +
+                     XML->sys.core[ithCore].function_calls*16;
+                IRF->rtp_stats = IRF->stats_t;
+
+                FRF->stats_t.readAc.access  = XML->sys.core[ithCore].float_regfile_reads +
+                     XML->sys.core[ithCore].function_calls*16;;
+                FRF->stats_t.writeAc.access  = XML->sys.core[ithCore].float_regfile_writes+
+                     XML->sys.core[ithCore].function_calls*16;;
+                FRF->rtp_stats = FRF->stats_t;
+        }
+    }
+        IRF->power_t.reset();
+        FRF->power_t.reset();
+        IRF->power_t.readOp.dynamic  +=  (IRF->stats_t.readAc.access*IRF->local_result.power.readOp.dynamic
+                        +IRF->stats_t.writeAc.access*IRF->local_result.power.writeOp.dynamic);
+        FRF->power_t.readOp.dynamic  +=  (FRF->stats_t.readAc.access*FRF->local_result.power.readOp.dynamic
+                        +FRF->stats_t.writeAc.access*FRF->local_result.power.writeOp.dynamic);
+        if (coredynp.regWindowing)
+        {
+                RFWIN->power_t.reset();
+                RFWIN->power_t.readOp.dynamic   +=  (RFWIN->stats_t.readAc.access*RFWIN->local_result.power.readOp.dynamic +
+                                RFWIN->stats_t.writeAc.access*RFWIN->local_result.power.writeOp.dynamic);
+        }
+
+        if (is_tdp)
+        {
+                IRF->power  =  IRF->power_t + IRF->local_result.power *coredynp.pppm_lkg_multhread;
+                FRF->power  =  FRF->power_t + FRF->local_result.power *coredynp.pppm_lkg_multhread;
+                power      =  power + (IRF->power + FRF->power);
+                if (coredynp.regWindowing)
+                {
+                        RFWIN->power = RFWIN->power_t + RFWIN->local_result.power *pppm_lkg;
+                        power        = power + RFWIN->power;
+                }
+        }
+        else
+        {
+                IRF->rt_power  =  IRF->power_t + IRF->local_result.power *coredynp.pppm_lkg_multhread;
+                FRF->rt_power  =  FRF->power_t + FRF->local_result.power *coredynp.pppm_lkg_multhread;
+                rt_power          =  rt_power + (IRF->power_t + FRF->power_t);
+                if (coredynp.regWindowing)
+                {
+                        RFWIN->rt_power = RFWIN->power_t + RFWIN->local_result.power *pppm_lkg;
+                        rt_power        = rt_power + RFWIN->rt_power;
+                }
+        }
+}
+
+
+void RegFU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {      cout << indent_str << "Integer RF:" << endl;
+                cout << indent_str_next << "Area = " << IRF->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << IRF->power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? IRF->power.readOp.longer_channel_leakage:IRF->power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << IRF->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << IRF->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str<< "Floating Point RF:" << endl;
+                cout << indent_str_next << "Area = " << FRF->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << FRF->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? FRF->power.readOp.longer_channel_leakage:FRF->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << FRF->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << FRF->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                if (coredynp.regWindowing)
+                {
+                        cout << indent_str << "Register Windows:" << endl;
+                        cout << indent_str_next << "Area = " << RFWIN->area.get_area() *1e-6 << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << RFWIN->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? RFWIN->power.readOp.longer_channel_leakage:RFWIN->power.readOp.leakage)  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << RFWIN->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << RFWIN->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                }
+        }
+        else
+        {
+                cout << indent_str_next << "Integer RF    Peak Dynamic = " << IRF->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Integer RF    Subthreshold Leakage = " << IRF->rt_power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Integer RF    Gate Leakage = " << IRF->rt_power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Floating Point RF   Peak Dynamic = " << FRF->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Floating Point RF   Subthreshold Leakage = " << FRF->rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Floating Point RF   Gate Leakage = " << FRF->rt_power.readOp.gate_leakage  << " W" << endl;
+                if (coredynp.regWindowing)
+                {
+                        cout << indent_str_next << "Register Windows   Peak Dynamic = " << RFWIN->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Register Windows   Subthreshold Leakage = " << RFWIN->rt_power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next << "Register Windows   Gate Leakage = " << RFWIN->rt_power.readOp.gate_leakage  << " W" << endl;
+                }
+        }
+}
+
+
+void EXECU::computeEnergy(bool is_tdp)
+{
+        if (!exist) return;
+        double pppm_t[4]    = {1,1,1,1};
+//     rfu->power.reset();
+//     rfu->rt_power.reset();
+//     scheu->power.reset();
+//     scheu->rt_power.reset();
+//     exeu->power.reset();
+//     exeu->rt_power.reset();
+
+        rfu->computeEnergy(is_tdp);
+        scheu->computeEnergy(is_tdp);
+        exeu->computeEnergy(is_tdp);
+        if (coredynp.num_fpus >0)
+        {
+                fp_u->computeEnergy(is_tdp);
+        }
+        if (coredynp.num_muls >0)
+        {
+                mul->computeEnergy(is_tdp);
+        }
+
+        if (is_tdp)
+        {
+                set_pppm(pppm_t, 2*coredynp.ALU_cdb_duty_cycle, 2, 2, 2*coredynp.ALU_cdb_duty_cycle);//2 means two source operands needs to be passed for each int instruction.
+                bypass.power = bypass.power + intTagBypass->power*pppm_t + int_bypass->power*pppm_t;
+                if (coredynp.num_muls >0)
+                {
+                        set_pppm(pppm_t, 2*coredynp.MUL_cdb_duty_cycle, 2, 2, 2*coredynp.MUL_cdb_duty_cycle);//2 means two source operands needs to be passed for each int instruction.
+                        bypass.power = bypass.power + intTag_mul_Bypass->power*pppm_t + int_mul_bypass->power*pppm_t;
+                        power      = power + mul->power;
+                }
+                if (coredynp.num_fpus>0)
+                {
+                        set_pppm(pppm_t, 3*coredynp.FPU_cdb_duty_cycle, 3, 3, 3*coredynp.FPU_cdb_duty_cycle);//3 means three source operands needs to be passed for each fp instruction.
+                        bypass.power = bypass.power + fp_bypass->power*pppm_t  + fpTagBypass->power*pppm_t ;
+                        power      = power + fp_u->power;
+                }
+
+                power      = power + rfu->power + exeu->power + bypass.power + scheu->power;
+        }
+        else
+        {
+                set_pppm(pppm_t, XML->sys.core[ithCore].cdb_alu_accesses, 2, 2, XML->sys.core[ithCore].cdb_alu_accesses);
+                bypass.rt_power = bypass.rt_power + intTagBypass->power*pppm_t;
+                bypass.rt_power = bypass.rt_power + int_bypass->power*pppm_t;
+
+                if (coredynp.num_muls >0)
+                {
+                        set_pppm(pppm_t, XML->sys.core[ithCore].cdb_mul_accesses, 2, 2, XML->sys.core[ithCore].cdb_mul_accesses);//2 means two source operands needs to be passed for each int instruction.
+                        bypass.rt_power = bypass.rt_power + intTag_mul_Bypass->power*pppm_t + int_mul_bypass->power*pppm_t;
+                        rt_power      = rt_power + mul->rt_power;
+                }
+
+                if (coredynp.num_fpus>0)
+                {
+                        set_pppm(pppm_t, XML->sys.core[ithCore].cdb_fpu_accesses, 3, 3, XML->sys.core[ithCore].cdb_fpu_accesses);
+                        bypass.rt_power = bypass.rt_power + fp_bypass->power*pppm_t;
+                        bypass.rt_power = bypass.rt_power + fpTagBypass->power*pppm_t;
+                        rt_power      = rt_power + fp_u->rt_power;
+                }
+                rt_power      = rt_power + rfu->rt_power + exeu->rt_power + bypass.rt_power + scheu->rt_power;
+        }
+}
+
+void EXECU::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        if (!exist) return;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+
+//     cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
+        if (is_tdp)
+        {
+                cout << indent_str << "Register Files:" << endl;
+                cout << indent_str_next << "Area = " << rfu->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << rfu->power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? rfu->power.readOp.longer_channel_leakage:rfu->power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << rfu->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << rfu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                if (plevel>3){
+                        rfu->displayEnergy(indent+4,is_tdp);
+                }
+                cout << indent_str << "Instruction Scheduler:" << endl;
+                cout << indent_str_next << "Area = " << scheu->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << scheu->power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? scheu->power.readOp.longer_channel_leakage:scheu->power.readOp.leakage)  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << scheu->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << scheu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+                if (plevel>3){
+                        scheu->displayEnergy(indent+4,is_tdp);
+                }
+                exeu->displayEnergy(indent,is_tdp);
+                if (coredynp.num_fpus>0)
+                {
+                        fp_u->displayEnergy(indent,is_tdp);
+                }
+                if (coredynp.num_muls >0)
+                {
+                        mul->displayEnergy(indent,is_tdp);
+                }
+                cout << indent_str << "Results Broadcast Bus:" << endl;
+                cout << indent_str_next << "Area Overhead = " << bypass.area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << bypass.power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? bypass.power.readOp.longer_channel_leakage:bypass.power.readOp.leakage ) << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << bypass.power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << bypass.rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+                cout << indent_str_next << "Register Files    Peak Dynamic = " << rfu->rt_power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Register Files    Subthreshold Leakage = " << rfu->rt_power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Register Files    Gate Leakage = " << rfu->rt_power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Instruction Sheduler   Peak Dynamic = " << scheu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Instruction Sheduler   Subthreshold Leakage = " << scheu->rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Instruction Sheduler   Gate Leakage = " << scheu->rt_power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Results Broadcast Bus   Peak Dynamic = " << bypass.rt_power.readOp.dynamic*clockRate  << " W" << endl;
+                cout << indent_str_next << "Results Broadcast Bus   Subthreshold Leakage = " << bypass.rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Results Broadcast Bus   Gate Leakage = " << bypass.rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+
+}
+
+void Core::computeEnergy(bool is_tdp)
+{
+        //power_point_product_masks
+        double pppm_t[4]    = {1,1,1,1};
+    double rtp_pipeline_coe;
+    double num_units = 4.0;
+        if (is_tdp)
+        {
+                ifu->computeEnergy(is_tdp);
+                lsu->computeEnergy(is_tdp);
+                mmu->computeEnergy(is_tdp);
+                exu->computeEnergy(is_tdp);
+
+                if (coredynp.core_ty==OOO)
+                {
+                        num_units = 5.0;
+                        rnu->computeEnergy(is_tdp);
+                        set_pppm(pppm_t, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                        if (rnu->exist)
+                        {
+                                rnu->power = rnu->power + corepipe->power*pppm_t;
+                                power     = power + rnu->power;
+                        }
+                }
+
+                if (ifu->exist)
+                {
+                        set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.IFU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+//                     cout << "IFU = " << ifu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        ifu->power = ifu->power + corepipe->power*pppm_t;
+//                     cout << "IFU = " << ifu->power.readOp.dynamic*clockRate  << " W" << endl;
+//                     cout << "1/4 pipe = " << corepipe->power.readOp.dynamic*clockRate/num_units  << " W" << endl;
+                        power     = power + ifu->power;
+//                     cout << "core = " << power.readOp.dynamic*clockRate  << " W" << endl;
+                }
+                if (lsu->exist)
+                {
+                        set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.LSU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                        lsu->power = lsu->power + corepipe->power*pppm_t;
+//                     cout << "LSU = " << lsu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        power     = power + lsu->power;
+//                     cout << "core = " << power.readOp.dynamic*clockRate  << " W" << endl;
+                }
+                if (exu->exist)
+                {
+                        set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.ALU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                        exu->power = exu->power + corepipe->power*pppm_t;
+//                     cout << "EXE = " << exu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        power     = power + exu->power;
+//                     cout << "core = " << power.readOp.dynamic*clockRate  << " W" << endl;
+                }
+                if (mmu->exist)
+                {
+                        set_pppm(pppm_t, coredynp.num_pipelines/num_units*(0.5+0.5*coredynp.LSU_duty_cycle), coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                        mmu->power = mmu->power + corepipe->power*pppm_t;
+//                     cout << "MMU = " << mmu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        power     = power +  mmu->power;
+//                     cout << "core = " << power.readOp.dynamic*clockRate  << " W" << endl;
+                }
+
+                power     = power +  undiffCore->power;
+
+                if (XML->sys.Private_L2)
+                {
+
+                        l2cache->computeEnergy(is_tdp);
+                        set_pppm(pppm_t,l2cache->cachep.clockRate/clockRate, 1,1,1);
+                        //l2cache->power = l2cache->power*pppm_t;
+                        power = power  + l2cache->power*pppm_t;
+                }
+        }
+        else
+        {
+                ifu->computeEnergy(is_tdp);
+                lsu->computeEnergy(is_tdp);
+                mmu->computeEnergy(is_tdp);
+                exu->computeEnergy(is_tdp);
+                if (coredynp.core_ty==OOO)
+                {
+                        num_units = 5.0;
+                        rnu->computeEnergy(is_tdp);
+                set_pppm(pppm_t, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                        if (rnu->exist)
+                        {
+                rnu->rt_power = rnu->rt_power + corepipe->power*pppm_t;
+
+                        rt_power      = rt_power + rnu->rt_power;
+                        }
+                }
+                else
+                {
+                        if (XML->sys.homogeneous_cores==1)
+                        {
+                                rtp_pipeline_coe = coredynp.pipeline_duty_cycle * XML->sys.total_cycles * XML->sys.number_of_cores;
+                        }
+                        else
+                        {
+                                rtp_pipeline_coe = coredynp.pipeline_duty_cycle * coredynp.total_cycles;
+                        }
+                set_pppm(pppm_t, coredynp.num_pipelines*rtp_pipeline_coe/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units);
+                }
+
+                if (ifu->exist)
+                {
+                        ifu->rt_power = ifu->rt_power + corepipe->power*pppm_t;
+                        rt_power     = rt_power + ifu->rt_power ;
+                }
+                if (lsu->exist)
+                {
+                        lsu->rt_power = lsu->rt_power + corepipe->power*pppm_t;
+                        rt_power     = rt_power  + lsu->rt_power;
+                }
+                if (exu->exist)
+                {
+                        exu->rt_power = exu->rt_power + corepipe->power*pppm_t;
+                        rt_power     = rt_power  + exu->rt_power;
+                }
+                if (mmu->exist)
+                {
+                        mmu->rt_power = mmu->rt_power + corepipe->power*pppm_t;
+                        rt_power     = rt_power +  mmu->rt_power ;
+                }
+
+                rt_power     = rt_power +  undiffCore->power;
+//             cout << "EXE = " << exu->power.readOp.dynamic*clockRate  << " W" << endl;
+                if (XML->sys.Private_L2)
+                {
+
+                        l2cache->computeEnergy(is_tdp);
+                        //set_pppm(pppm_t,1/l2cache->cachep.executionTime, 1,1,1);
+                        //l2cache->rt_power = l2cache->rt_power*pppm_t;
+                        rt_power = rt_power  + l2cache->rt_power;
+                }
+        }
+
+}
+
+void Core::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+        if (is_tdp)
+        {
+                cout << "Core:" << endl;
+                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str << "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout<<endl;
+                if (ifu->exist)
+                {
+                        cout << indent_str << "Instruction Fetch Unit:" << endl;
+                        cout << indent_str_next << "Area = " << ifu->area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << ifu->power.readOp.dynamic*clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? ifu->power.readOp.longer_channel_leakage:ifu->power.readOp.leakage) <<" W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << ifu->power.readOp.longer_channel_leakage <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << ifu->power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << ifu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (plevel >2){
+                                ifu->displayEnergy(indent+4,plevel,is_tdp);
+                        }
+                }
+                if (coredynp.core_ty==OOO)
+                {
+                        if (rnu->exist)
+                        {
+                                cout << indent_str<< "Renaming Unit:" << endl;
+                                cout << indent_str_next << "Area = " << rnu->area.get_area()*1e-6  << " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << rnu->power.readOp.dynamic*clockRate  << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                        << (long_channel? rnu->power.readOp.longer_channel_leakage:rnu->power.readOp.leakage)  << " W" << endl;
+                                //cout << indent_str_next << "Subthreshold Leakage = " << rnu->power.readOp.longer_channel_leakage  << " W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << rnu->power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << rnu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                                cout <<endl;
+                                if (plevel >2){
+                                        rnu->displayEnergy(indent+4,plevel,is_tdp);
+                                }
+                        }
+
+                }
+                if (lsu->exist)
+                {
+                        cout << indent_str<< "Load Store Unit:" << endl;
+                        cout << indent_str_next << "Area = " << lsu->area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << lsu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? lsu->power.readOp.longer_channel_leakage:lsu->power.readOp.leakage ) << " W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << lsu->power.readOp.longer_channel_leakage  << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << lsu->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << lsu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (plevel >2){
+                                lsu->displayEnergy(indent+4,plevel,is_tdp);
+                        }
+                }
+                if (mmu->exist)
+                {
+                        cout << indent_str<< "Memory Management Unit:" << endl;
+                        cout << indent_str_next << "Area = " << mmu->area.get_area() *1e-6 << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << mmu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? mmu->power.readOp.longer_channel_leakage:mmu->power.readOp.leakage)   << " W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << mmu->power.readOp.longer_channel_leakage   << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << mmu->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << mmu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (plevel >2){
+                                mmu->displayEnergy(indent+4,plevel,is_tdp);
+                        }
+                }
+                if (exu->exist)
+                {
+                        cout << indent_str<< "Execution Unit:" << endl;
+                        cout << indent_str_next << "Area = " << exu->area.get_area()  *1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << exu->power.readOp.dynamic*clockRate  << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? exu->power.readOp.longer_channel_leakage:exu->power.readOp.leakage)   << " W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << exu->power.readOp.longer_channel_leakage << " W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << exu->power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << exu->rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                        if (plevel >2){
+                                exu->displayEnergy(indent+4,plevel,is_tdp);
+                        }
+                }
+//             if (plevel >2)
+//             {
+//                     if (undiffCore->exist)
+//                     {
+//                             cout << indent_str << "Undifferentiated Core" << endl;
+//                             cout << indent_str_next << "Area = " << undiffCore->area.get_area()*1e-6<< " mm^2" << endl;
+//                             cout << indent_str_next << "Peak Dynamic = " << undiffCore->power.readOp.dynamic*clockRate << " W" << endl;
+////                           cout << indent_str_next << "Subthreshold Leakage = " << undiffCore->power.readOp.leakage <<" W" << endl;
+//                             cout << indent_str_next << "Subthreshold Leakage = "
+//                                                             << (long_channel? undiffCore->power.readOp.longer_channel_leakage:undiffCore->power.readOp.leakage)   << " W" << endl;
+//                             cout << indent_str_next << "Gate Leakage = " << undiffCore->power.readOp.gate_leakage << " W" << endl;
+//                             //              cout << indent_str_next << "Runtime Dynamic = " << undiffCore->rt_power.readOp.dynamic/executionTime << " W" << endl;
+//                             cout <<endl;
+//                     }
+//             }
+                if (XML->sys.Private_L2)
+                {
+
+                        l2cache->displayEnergy(4,is_tdp);
+                }
+
+        }
+        else
+        {
+//             cout << indent_str_next << "Instruction Fetch Unit    Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
+//             cout << indent_str_next << "Instruction Fetch Unit    Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl;
+//             cout << indent_str_next << "Instruction Fetch Unit    Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Subthreshold Leakage = " << lsu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Gate Leakage = " << lsu->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Subthreshold Leakage = " << mmu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Gate Leakage = " << mmu->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Subthreshold Leakage = " << exu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Gate Leakage = " << exu->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+}
+InstFetchU ::~InstFetchU(){
+
+        if (!exist) return;
+        if(IB)                            {delete IB; IB = 0;}
+        if(ID_inst)               {delete ID_inst; ID_inst = 0;}
+        if(ID_operand)                    {delete ID_operand; ID_operand = 0;}
+        if(ID_misc)               {delete ID_misc; ID_misc = 0;}
+        if (coredynp.predictionW>0)
+        {
+                if(BTB)                       {delete BTB; BTB = 0;}
+                if(BPT)                       {delete BPT; BPT = 0;}
+        }
+}
+
+BranchPredictor ::~BranchPredictor(){
+
+        if (!exist) return;
+        if(globalBPT)             {delete globalBPT; globalBPT = 0;}
+        if(localBPT)              {delete localBPT; localBPT = 0;}
+    if(L1_localBPT)           {delete L1_localBPT; L1_localBPT = 0;}
+    if(L2_localBPT)           {delete L2_localBPT; L2_localBPT = 0;}
+    if(chooser)                   {delete chooser; chooser = 0;}
+    if(RAS)                   {delete RAS; RAS = 0;}
+        }
+
+RENAMINGU ::~RENAMINGU(){
+
+        if (!exist) return;
+        if(iFRAT )                    {delete iFRAT; iFRAT = 0;}
+    if(fFRAT )                        {delete fFRAT; fFRAT =0;}
+    if(iRRAT)                  {delete iRRAT; iRRAT = 0;}
+    if(iFRAT)                  {delete iFRAT; iFRAT = 0;}
+    if(ifreeL)                 {delete ifreeL;ifreeL= 0;}
+    if(ffreeL)                 {delete ffreeL;ffreeL= 0;}
+    if(idcl)                   {delete idcl;  idcl = 0;}
+    if(fdcl)                   {delete fdcl;  fdcl = 0;}
+    if(RAHT)                   {delete RAHT;  RAHT = 0;}
+        }
+
+// Frees the load/store queue. Nothing is released when the unit was marked as
+// non-existent.
+LoadStoreU ::~LoadStoreU(){
+
+        if (exist)
+        {
+                // delete on a null pointer is a no-op, so no guard is needed.
+                delete LSQ;
+                LSQ = 0;
+        }
+}
+
+// Frees the instruction and data TLBs. Nothing is released when the unit was
+// marked as non-existent.
+MemManU ::~MemManU(){
+
+        if (exist)
+        {
+                // delete on a null pointer is a no-op, so no per-pointer guard is needed.
+                delete itlb; itlb = 0;
+                delete dtlb; dtlb = 0;
+        }
+}
+
+RegFU ::~RegFU(){
+
+        if (!exist) return;
+        if(IRF)                       {delete IRF; IRF = 0;}
+    if(FRF)                   {delete FRF; FRF = 0;}
+    if(RFWIN)                 {delete RFWIN; RFWIN = 0;}
+        }
+
+SchedulerU ::~SchedulerU(){
+
+        if (!exist) return;
+        if(int_inst_window)       {delete int_inst_window; int_inst_window = 0;}
+        if(fp_inst_window)            {delete int_inst_window; int_inst_window = 0;}
+        if(ROB)                       {delete ROB; ROB = 0;}
+    if(instruction_selection)  {delete instruction_selection;instruction_selection = 0;}
+        }
+
+EXECU ::~EXECU(){
+
+        if (!exist) return;
+        if(int_bypass)                    {delete int_bypass; int_bypass = 0;}
+    if(intTagBypass)          {delete intTagBypass; intTagBypass =0;}
+    if(int_mul_bypass)                {delete int_mul_bypass; int_mul_bypass = 0;}
+    if(intTag_mul_Bypass)         {delete intTag_mul_Bypass; intTag_mul_Bypass =0;}
+    if(fp_bypass)                 {delete fp_bypass;fp_bypass = 0;}
+    if(fpTagBypass)           {delete fpTagBypass;fpTagBypass = 0;}
+    if(fp_u)                   {delete fp_u;fp_u = 0;}
+    if(exeu)                   {delete exeu;exeu = 0;}
+    if(mul)                    {delete mul;mul = 0;}
+    if(rfu)                    {delete rfu;rfu = 0;}
+        if(scheu)                     {delete scheu; scheu = 0;}
+        }
+
+Core ::~Core(){
+
+        if(ifu)                       {delete ifu; ifu = 0;}
+        if(lsu)                       {delete lsu; lsu = 0;}
+        if(rnu)                       {delete rnu; rnu = 0;}
+        if(mmu)                       {delete mmu; mmu = 0;}
+        if(exu)                       {delete exu; exu = 0;}
+    if(corepipe)                  {delete corepipe; corepipe = 0;}
+    if(undiffCore)             {delete undiffCore;undiffCore = 0;}
+    if(l2cache)                {delete l2cache;l2cache = 0;}
+        }
+
+void Core::set_core_param()
+{
+        coredynp.opt_local = XML->sys.core[ithCore].opt_local;
+        coredynp.x86 = XML->sys.core[ithCore].x86;
+        coredynp.Embedded = XML->sys.Embedded;
+        coredynp.core_ty   = (enum Core_type)XML->sys.core[ithCore].machine_type;
+        coredynp.rm_ty     = (enum Renaming_type)XML->sys.core[ithCore].rename_scheme;
+    coredynp.fetchW    = XML->sys.core[ithCore].fetch_width;
+    coredynp.decodeW   = XML->sys.core[ithCore].decode_width;
+    coredynp.issueW    = XML->sys.core[ithCore].issue_width;
+    coredynp.peak_issueW   = XML->sys.core[ithCore].peak_issue_width;
+    coredynp.commitW       = XML->sys.core[ithCore].commit_width;
+    coredynp.peak_commitW  = XML->sys.core[ithCore].peak_issue_width;
+    coredynp.predictionW   = XML->sys.core[ithCore].prediction_width;
+    coredynp.fp_issueW     = XML->sys.core[ithCore].fp_issue_width;
+    coredynp.fp_decodeW    = XML->sys.core[ithCore].fp_issue_width;
+    coredynp.num_alus      = XML->sys.core[ithCore].ALU_per_core;
+    coredynp.num_fpus      = XML->sys.core[ithCore].FPU_per_core;
+    coredynp.num_muls      = XML->sys.core[ithCore].MUL_per_core;
+
+
+    coredynp.num_hthreads           = XML->sys.core[ithCore].number_hardware_threads;
+    coredynp.multithreaded       = coredynp.num_hthreads>1? true:false;
+    coredynp.instruction_length  = XML->sys.core[ithCore].instruction_length;
+    coredynp.pc_width            = XML->sys.virtual_address_width;
+
+        coredynp.opcode_length       = XML->sys.core[ithCore].opcode_width;
+    coredynp.micro_opcode_length = XML->sys.core[ithCore].micro_opcode_width;
+    coredynp.num_pipelines       = XML->sys.core[ithCore].pipelines_per_core[0];
+    coredynp.pipeline_stages     = XML->sys.core[ithCore].pipeline_depth[0];
+    coredynp.num_fp_pipelines    = XML->sys.core[ithCore].pipelines_per_core[1];
+    coredynp.fp_pipeline_stages  = XML->sys.core[ithCore].pipeline_depth[1];
+    coredynp.int_data_width      = int(ceil(XML->sys.machine_bits/32.0))*32;
+    coredynp.fp_data_width       = coredynp.int_data_width;
+    coredynp.v_address_width     = XML->sys.virtual_address_width;
+    coredynp.p_address_width     = XML->sys.physical_address_width;
+
+        coredynp.scheu_ty         = (enum Scheduler_type)XML->sys.core[ithCore].instruction_window_scheme;
+        coredynp.arch_ireg_width  =  int(ceil(log2(XML->sys.core[ithCore].archi_Regs_IRF_size)));
+        coredynp.arch_freg_width  =  int(ceil(log2(XML->sys.core[ithCore].archi_Regs_FRF_size)));
+        coredynp.num_IRF_entry    = XML->sys.core[ithCore].archi_Regs_IRF_size;
+        coredynp.num_FRF_entry    = XML->sys.core[ithCore].archi_Regs_FRF_size;
+        coredynp.pipeline_duty_cycle = XML->sys.core[ithCore].pipeline_duty_cycle;
+        coredynp.total_cycles        = XML->sys.core[ithCore].total_cycles;
+        coredynp.busy_cycles         = XML->sys.core[ithCore].busy_cycles;
+        coredynp.idle_cycles         = XML->sys.core[ithCore].idle_cycles;
+
+        //Max power duty cycle for peak power estimation
+//     if (coredynp.core_ty==OOO)
+//     {
+//             coredynp.IFU_duty_cycle = 1;
+//             coredynp.LSU_duty_cycle = 1;
+//             coredynp.MemManU_I_duty_cycle =1;
+//             coredynp.MemManU_D_duty_cycle =1;
+//             coredynp.ALU_duty_cycle =1;
+//             coredynp.MUL_duty_cycle =1;
+//             coredynp.FPU_duty_cycle =1;
+//             coredynp.ALU_cdb_duty_cycle =1;
+//             coredynp.MUL_cdb_duty_cycle =1;
+//             coredynp.FPU_cdb_duty_cycle =1;
+//     }
+//     else
+//     {
+                coredynp.IFU_duty_cycle = XML->sys.core[ithCore].IFU_duty_cycle;
+                coredynp.BR_duty_cycle = XML->sys.core[ithCore].BR_duty_cycle;
+                coredynp.LSU_duty_cycle = XML->sys.core[ithCore].LSU_duty_cycle;
+                coredynp.MemManU_I_duty_cycle = XML->sys.core[ithCore].MemManU_I_duty_cycle;
+                coredynp.MemManU_D_duty_cycle = XML->sys.core[ithCore].MemManU_D_duty_cycle;
+                coredynp.ALU_duty_cycle = XML->sys.core[ithCore].ALU_duty_cycle;
+                coredynp.MUL_duty_cycle = XML->sys.core[ithCore].MUL_duty_cycle;
+                coredynp.FPU_duty_cycle = XML->sys.core[ithCore].FPU_duty_cycle;
+                coredynp.ALU_cdb_duty_cycle = XML->sys.core[ithCore].ALU_cdb_duty_cycle;
+                coredynp.MUL_cdb_duty_cycle = XML->sys.core[ithCore].MUL_cdb_duty_cycle;
+                coredynp.FPU_cdb_duty_cycle = XML->sys.core[ithCore].FPU_cdb_duty_cycle;
+//     }
+
+
+        if (!((coredynp.core_ty==OOO)||(coredynp.core_ty==Inorder)))
+        {
+                cout<<"Invalid Core Type"<<endl;
+                exit(0);
+        }
+//     if (coredynp.core_ty==OOO)
+//     {
+//             cout<<"OOO processor models are being updated and will be available in next release"<<endl;
+//             exit(0);
+//     }
+        if (!((coredynp.scheu_ty==PhysicalRegFile)||(coredynp.scheu_ty==ReservationStation)))
+        {
+                cout<<"Invalid OOO Scheduler Type"<<endl;
+                exit(0);
+        }
+
+        if (!((coredynp.rm_ty ==RAMbased)||(coredynp.rm_ty ==CAMbased)))
+        {
+                cout<<"Invalid OOO Renaming Type"<<endl;
+                exit(0);
+        }
+
+if (coredynp.core_ty==OOO)
+{
+        if (coredynp.scheu_ty==PhysicalRegFile)
+        {
+          coredynp.phy_ireg_width  =  int(ceil(log2(XML->sys.core[ithCore].phy_Regs_IRF_size)));
+          coredynp.phy_freg_width  =  int(ceil(log2(XML->sys.core[ithCore].phy_Regs_FRF_size)));
+          coredynp.num_ifreelist_entries = coredynp.num_IRF_entry  = XML->sys.core[ithCore].phy_Regs_IRF_size;
+          coredynp.num_ffreelist_entries = coredynp.num_FRF_entry  = XML->sys.core[ithCore].phy_Regs_FRF_size;
+        }
+        else if (coredynp.scheu_ty==ReservationStation)
+        {//ROB serves as Phy RF in RS based OOO
+      coredynp.phy_ireg_width  =  int(ceil(log2(XML->sys.core[ithCore].ROB_size)));
+          coredynp.phy_freg_width  =  int(ceil(log2(XML->sys.core[ithCore].ROB_size)));
+          coredynp.num_ifreelist_entries = XML->sys.core[ithCore].ROB_size;
+          coredynp.num_ffreelist_entries = XML->sys.core[ithCore].ROB_size;
+
+        }
+
+}
+        coredynp.globalCheckpoint   =  32;//best check pointing entries for a 4~8 issue OOO should be 16~48;See TR for reference.
+        coredynp.perThreadState     =  8;
+        coredynp.instruction_length = 32;
+        coredynp.clockRate          =  XML->sys.core[ithCore].clock_rate;
+        coredynp.clockRate          *= 1e6;
+        coredynp.regWindowing= (XML->sys.core[ithCore].register_windows_size>0&&coredynp.core_ty==Inorder)?true:false;
+        coredynp.executionTime = XML->sys.total_cycles/coredynp.clockRate;
+        set_pppm(coredynp.pppm_lkg_multhread, 0, coredynp.num_hthreads, coredynp.num_hthreads, 0);
+}
diff --git a/ext/mcpat/core.h b/ext/mcpat/core.h
new file mode 100644 (file)
index 0000000..8ef3bab
--- /dev/null
@@ -0,0 +1,262 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef CORE_H_
+#define CORE_H_
+
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "interconnect.h"
+#include "logic.h"
+#include "parameter.h"
+#include "sharedcache.h"
+
+class BranchPredictor :public Component { // Component subclass for one core's branch predictor (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        ArrayST * globalBPT; // predictor structures below are each held through an ArrayST pointer
+        ArrayST * localBPT;
+        ArrayST * L1_localBPT;
+        ArrayST * L2_localBPT;
+        ArrayST * chooser;
+        ArrayST * RAS;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        BranchPredictor(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true); // NOTE(review): last parameter is spelled "exsit"; SchedulerU and later classes in this header spell it "exist_"
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~BranchPredictor();
+};
+
+
+class InstFetchU :public Component { // Component subclass for one core's instruction-fetch unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        enum Cache_policy cache_p;
+        InstCache icache; // held by value, unlike the pointer members below
+        ArrayST * IB;
+        ArrayST * BTB;
+        BranchPredictor * BPT; // the unit refers to a BranchPredictor through this pointer
+        inst_decoder * ID_inst; // three separate inst_decoder pointers
+        inst_decoder * ID_operand;
+        inst_decoder * ID_misc;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        InstFetchU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true); // NOTE(review): last parameter is spelled "exsit"; SchedulerU and later classes in this header spell it "exist_"
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~InstFetchU();
+};
+
+
+class SchedulerU :public Component { // Component subclass for one core's scheduler (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        double Iw_height, fp_Iw_height,ROB_height; // NOTE(review): units are not visible in this header
+        ArrayST         * int_inst_window;
+        ArrayST         * fp_inst_window;
+        ArrayST         * ROB;
+    selection_logic * instruction_selection;
+    bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+    SchedulerU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~SchedulerU();
+};
+
+class RENAMINGU :public Component { // Component subclass for one core's register-renaming unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        double clockRate,executionTime;
+        CoreDynParam  coredynp;
+        ArrayST * iFRAT; // i*/f* pairs: presumably integer and floating-point variants -- confirm
+        ArrayST * fFRAT;
+        ArrayST * iRRAT;
+        ArrayST * fRRAT;
+        ArrayST * ifreeL;
+        ArrayST * ffreeL;
+        dep_resource_conflict_check * idcl;
+        dep_resource_conflict_check * fdcl;
+        ArrayST * RAHT;//register alias history table Used to store GC
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+
+        RENAMINGU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~RENAMINGU();
+};
+
+class LoadStoreU :public Component { // Component subclass for one core's load/store unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        enum Cache_policy cache_p;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        double lsq_height; // NOTE(review): units are not visible in this header
+        DataCache dcache; // held by value, unlike the queue pointers below
+        ArrayST * LSQ;//it is actually the store queue but for inorder processors it serves as both loadQ and StoreQ
+        ArrayST * LoadQ;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        LoadStoreU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~LoadStoreU();
+};
+
+class MemManU :public Component { // Component subclass for one core's memory-management unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        ArrayST * itlb; // presumably the instruction and data TLBs, judging by the names -- confirm
+        ArrayST * dtlb;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        MemManU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~MemManU();
+};
+
+class RegFU :public Component { // Component subclass for one core's register-file unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        double int_regfile_height, fp_regfile_height; // NOTE(review): units are not visible in this header
+        ArrayST * IRF; // presumably integer / floating-point register files and the register-window array -- confirm
+        ArrayST * FRF;
+        ArrayST * RFWIN;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        RegFU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~RegFU();
+};
+
+class EXECU :public Component { // Component subclass for one core's execution unit (role inferred from the name; implementation is not in this header)
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        double lsq_height; // the constructor takes a matching lsq_height_ argument
+        CoreDynParam  coredynp;
+        RegFU          * rfu; // sub-units referenced through pointers
+        SchedulerU     * scheu;
+    FunctionalUnit * fp_u;
+    FunctionalUnit * exeu;
+    FunctionalUnit * mul;
+        interconnect * int_bypass; // six interconnect pointers for the bypass wiring (data and tag variants, per the names)
+        interconnect * intTagBypass;
+        interconnect * int_mul_bypass;
+        interconnect * intTag_mul_Bypass;
+        interconnect * fp_bypass;
+        interconnect * fpTagBypass;
+        // `bypass` is a plain Component held by value; presumably an aggregate over the interconnects above -- confirm in core.cc.
+        Component  bypass;
+        bool exist; // NOTE(review): presumably "this unit is present in the core" -- confirm
+
+        EXECU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, double lsq_height_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~EXECU();
+};
+
+
+class Core :public Component { // Component subclass for a whole core; refers to its sub-units through the pointer members below
+  public:
+        // Configuration/state members; presumably filled from the constructor arguments -- confirm in core.cc.
+        ParseXML *XML;
+        int  ithCore; // core index; presumably selects XML->sys.core[ithCore] -- confirm
+        InputParameter interface_ip;
+        double clockRate,executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        InstFetchU * ifu; // sub-unit classes declared earlier in this header
+        LoadStoreU * lsu;
+        MemManU    * mmu;
+        EXECU      * exu;
+        RENAMINGU  * rnu;
+    Pipeline   * corepipe; // Pipeline, UndiffCore and SharedCache are declared in other headers
+    UndiffCore * undiffCore;
+    SharedCache * l2cache;
+    CoreDynParam  coredynp;
+    //full_decoder     inst_decoder;
+    //clock_network    clockNetwork;
+        Core(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_); // unlike the sub-units, takes no CoreDynParam and no exist flag
+        void set_core_param(); // presumably fills coredynp from the XML configuration -- confirm in core.cc
+        void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~Core();
+};
+
+#endif /* CORE_H_ */
diff --git a/ext/mcpat/globalvar.h b/ext/mcpat/globalvar.h
new file mode 100644 (file)
index 0000000..9532576
--- /dev/null
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef GLOBALVAR_H_
+#define GLOBALVAR_H_
+
+#ifdef  GLOBALVAR // when GLOBALVAR is defined before this header is included, EXTERN expands to nothing ...
+#define EXTERN
+#else // ... otherwise EXTERN expands to `extern`
+#define EXTERN extern
+#endif
+ // So the line below is a definition where GLOBALVAR is defined and an extern declaration everywhere else.
+EXTERN bool opt_for_clk; // read by interconnect.cc to gate its wire width/spacing scaling loops
+
+#endif /* GLOBALVAR_H_ */
+
+
+
+
diff --git a/ext/mcpat/interconnect.cc b/ext/mcpat/interconnect.cc
new file mode 100644 (file)
index 0000000..ba502b6
--- /dev/null
@@ -0,0 +1,222 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#include <cassert>
+#include <iostream>
+
+#include "globalvar.h"
+#include "interconnect.h"
+#include "wire.h"
+
+interconnect::interconnect(
+    string name_,
+    enum Device_ty device_ty_,
+        double base_w, double base_h,
+    int data_w, double len,const InputParameter *configure_interface,
+    int start_wiring_level_,
+    bool pipelinable_ ,
+    double route_over_perc_ ,
+    bool opt_local_,
+    enum Core_type core_ty_,
+    enum Wire_type wire_model,
+    double width_s, double space_s,
+    TechnologyParameter::DeviceType *dt
+)
+ :name(name_),
+  device_ty(device_ty_),
+  in_rise_time(0),
+  out_rise_time(0),
+  base_width(base_w),
+  base_height(base_h),
+  data_width(data_w),
+  wt(wire_model),
+  width_scaling(width_s),
+  space_scaling(space_s),
+  start_wiring_level(start_wiring_level_),
+  length(len),
+  //interconnect_latency(1e-12),
+  //interconnect_throughput(1e-12),
+  opt_local(opt_local_),
+  core_ty(core_ty_),
+  pipelinable(pipelinable_),
+  route_over_perc(route_over_perc_),
+  deviceType(dt)
+{
+
+  wt = Global;
+  l_ip=*configure_interface;
+  local_result = init_interface(&l_ip);
+
+
+  max_unpipelined_link_delay = 0; //TODO
+  min_w_nmos = g_tp.min_w_nmos_;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
+
+
+
+  latency               = l_ip.latency;
+  throughput            = l_ip.throughput;
+  latency_overflow=false;
+  throughput_overflow=false;
+
+  /*
+   * TODO: Add wiring option from semi-global to global automatically
+   * And directly jump to global if semi-global cannot satisfy timing
+   * Fat wires only available for global wires, thus
+   * if signal wiring layer starts from semi-global,
+   * the next layer up will be global, i.e., semi-global does
+   * not have fat wires.
+   */
+  if (pipelinable == false)
+  //Non-pipelinable wires, such as bypass logic, care latency
+  {
+          compute();
+          if (opt_for_clk && opt_local)
+          {
+                  while (delay > latency && width_scaling<3.0)
+                  {
+                          width_scaling *= 2;
+                          space_scaling *= 2;
+                          Wire winit(width_scaling, space_scaling);
+                          compute();
+                  }
+                  if (delay > latency)
+                  {
+                          latency_overflow=true;
+                  }
+          }
+  }
+  else //Pipelinable wires, such as bus, does not care latency but throughput
+  {
+          /*
+           * TODO: Add pipe regs power, area, and timing;
+           * Pipelinable wires optimize latency first.
+           */
+          compute();
+          if (opt_for_clk && opt_local)
+          {
+                  while (delay > throughput && width_scaling<3.0)
+                  {
+                          width_scaling *= 2;
+                          space_scaling *= 2;
+                          Wire winit(width_scaling, space_scaling);
+                          compute();
+                  }
+                  if (delay > throughput)
+                          // insert pipeline stages
+                  {
+                          num_pipe_stages = (int)ceil(delay/throughput);
+                          assert(num_pipe_stages>0);
+                          delay = delay/num_pipe_stages + num_pipe_stages*0.05*delay;
+                  }
+          }
+  }
+
+  power_bit = power;
+  power.readOp.dynamic *= data_width;
+  power.readOp.leakage *= data_width;
+  power.readOp.gate_leakage *= data_width;
+  area.set_area(area.get_area()*data_width);
+  no_device_under_wire_area.h *= data_width;
+
+  if (latency_overflow==true)
+                cout<< "Warning: "<< name <<" wire structure cannot satisfy latency constraint." << endl;
+
+
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+
+  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+
+  double sckRation = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= sckRation;
+  power.writeOp.dynamic *= sckRation;
+  power.searchOp.dynamic *= sckRation;
+
+  power.readOp.longer_channel_leakage =
+          power.readOp.leakage*long_channel_device_reduction;
+
+  if (pipelinable)//Only global wires has the option to choose whether routing over or not
+          area.set_area(area.get_area()*route_over_perc + no_device_under_wire_area.get_area()*(1-route_over_perc));
+
+  Wire wreset();
+}
+
+
+
+/*
+ * Evaluates one wire of type wt and length `length` with the current
+ * width/space scaling and copies its delay, read power, area and footprint
+ * into this object. Callers save the result as power_bit before multiplying
+ * by data_width.
+ */
+void
+interconnect::compute()
+{
+  Wire *const bit_wire = new Wire(wt, length, 1, width_scaling, space_scaling);
+
+  // Timing and the three read-power components.
+  delay                     = bit_wire->delay;
+  power.readOp.dynamic      = bit_wire->power.readOp.dynamic;
+  power.readOp.leakage      = bit_wire->power.readOp.leakage;
+  power.readOp.gate_leakage = bit_wire->power.readOp.gate_leakage;
+
+  // Footprint under which no devices are placed: one wire pitch high, as
+  // long as the wire.
+  no_device_under_wire_area.w = length;
+  no_device_under_wire_area.h = (bit_wire->wire_width + bit_wire->wire_spacing);
+  area.set_area(bit_wire->area.get_area());
+
+  delete bit_wire;
+}
+
+void interconnect::leakage_feedback(double temperature)
+{
+  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
+  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
+
+  compute();
+
+  power_bit = power;
+  power.readOp.dynamic *= data_width;
+  power.readOp.leakage *= data_width;
+  power.readOp.gate_leakage *= data_width;
+
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+
+  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+
+  double sckRation = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= sckRation;
+  power.writeOp.dynamic *= sckRation;
+  power.searchOp.dynamic *= sckRation;
+
+  power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
+}
+
diff --git a/ext/mcpat/interconnect.h b/ext/mcpat/interconnect.h
new file mode 100644 (file)
index 0000000..4cf42da
--- /dev/null
@@ -0,0 +1,111 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __INTERCONNECT_H__
+#define __INTERCONNECT_H__
+
+#include "assert.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+#include "subarray.h"
+#include "wire.h"
+
+// leakage power includes entire htree in a bank (when uca_tree == false)
+// leakage power includes only part to one bank when uca_tree == true
+
+class interconnect : public Component // wire model: delay, power and area of a data_width-bit link of a given length
+{
+  public:
+    interconnect( // evaluates the wire immediately; see interconnect.cc
+        string  name_,
+        enum Device_ty device_ty_,
+        double base_w, double base_h, int data_w, double len,
+        const InputParameter *configure_interface, int start_wiring_level_,
+        bool pipelinable_ = false,
+        double route_over_perc_ =0.5,
+        bool opt_local_=true,
+        enum Core_type core_ty_=Inorder,
+        enum Wire_type wire_model=Global, // NOTE(review): the constructor sets wt = Global regardless of this argument
+        double width_s=1.0, double space_s=1.0,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
+                );
+
+    ~interconnect() {};
+
+    void compute(); // evaluates one Wire with the current scaling and copies its delay/power/area into this object
+        string   name; // used in the latency-constraint warning message
+        enum Device_ty device_ty;
+    double in_rise_time, out_rise_time; // both initialised to 0 by the constructor
+        InputParameter l_ip; // private copy of *configure_interface
+        uca_org_t local_result; // result of init_interface(&l_ip)
+    Area no_device_under_wire_area; // w = length; h = (wire width + spacing), multiplied by data_width in the constructor
+    void set_in_rise_time(double rt)
+    {
+      in_rise_time = rt;
+    }
+
+    void leakage_feedback(double temperature); // rebuilds power for a temperature rounded to a multiple of 10
+    double max_unpipelined_link_delay; // set to 0 in the constructor (marked TODO there)
+    powerDef power_bit; // copy of power taken right after compute(), before multiplying by data_width
+
+    double wire_bw; // NOTE(review): not assigned by the constructor
+    double init_wire_bw;  // bus width at root
+    double base_width;
+    double base_height;
+    int data_width; // multiplier applied to the power and area figures from compute()
+    enum Wire_type wt;
+    double width_scaling, space_scaling; // doubled by the constructor's optimisation loops while width_scaling < 3.0
+    int start_wiring_level;
+    double length;
+    double min_w_nmos; // g_tp.min_w_nmos_
+    double min_w_pmos; // deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos
+    double latency, throughput; // targets copied from l_ip.latency / l_ip.throughput
+    bool  latency_overflow; // true when a non-pipelinable wire still misses the latency target after scaling
+    bool  throughput_overflow; // initialised to false; never set to true in interconnect.cc
+    double  interconnect_latency; // NOTE(review): its initialiser is commented out in the constructor
+    double  interconnect_throughput; // NOTE(review): its initialiser is commented out in the constructor
+    bool opt_local; // together with the global opt_for_clk, enables the scaling loops
+    enum Core_type core_ty;
+    bool pipelinable; // false: optimise for latency; true: optimise for throughput and allow pipeline stages
+    double route_over_perc; // weight used to blend area and no_device_under_wire_area for pipelinable wires
+    int  num_pipe_stages; // NOTE(review): assigned only when a pipelinable wire misses its throughput target
+
+  private:
+    TechnologyParameter::DeviceType *deviceType; // device parameters used for min_w_pmos
+
+};
+
+#endif
+
diff --git a/ext/mcpat/iocontrollers.cc b/ext/mcpat/iocontrollers.cc
new file mode 100644 (file)
index 0000000..70b0f2d
--- /dev/null
@@ -0,0 +1,446 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "const.h"
+#include "io.h"
+#include "iocontrollers.h"
+#include "logic.h"
+#include "parameter.h"
+
+/*
+SUN Niagara 2 I/O power analysis:
+total signal bits: 711
+Total FBDIMM bits: (14+10)*2*8= 384
+PCIe bits:         (8 + 8)*2 = 32
+10Gb NIC:          (4*2+4*2)*2 = 32
+Debug I/Os:        168
+Other I/Os:        711- 32-32 - 384 - 168 = 95
+
+According to "Implementation of an 8-Core, 64-Thread, Power-Efficient SPARC Server on a Chip"
+90% of I/Os are SerDers (the calculation is (384+64)/(711-168)=83%, about the same as the 90% reported in the paper)
+--> around 80Pins are common I/Os.
+Common I/Os consumes 71mW/Gb/s according to Cadence ChipEstimate @65nm
+Niagara 2 I/O clock is 1/4 of core clock. --> 87pin (<--((711-168)*17%)) * 71mW/Gb/s *0.25*1.4Ghz = 2.17W
+
+Total dynamic power of FBDIMM, NIC, PCIe = 84*0.132 + 84*0.049*0.132 = 11.14 - 2.17 = 8.98
+Further, if assuming I/O logic power is about 50% of I/Os then Total energy of FBDIMM, NIC, PCIe = 11.14 - 2.17*1.5 = 7.89
+ */
+
+/*
+ * A bug in Cadence ChipEstimator: after updating the clock rate in the clock tab, a user
+ * needs to re-select the IP clock (the same clk) and then click Estimate. If the clock is
+ * not re-selected, the new clock rate may not be propagated into the IPs.
+ *
+ */
+
+NIUController::NIUController(ParseXML *XML_interface,InputParameter* interface_ip_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_)
+ {
+          local_result = init_interface(&interface_ip);
+
+          double frontend_area, phy_area, mac_area, SerDer_area;
+      double frontend_dyn, mac_dyn, SerDer_dyn;
+      double frontend_gates, mac_gates, SerDer_gates;
+          double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+          double NMOS_sizing, PMOS_sizing;
+
+          set_niu_param();
+
+          if (niup.type == 0) //high performance NIU
+          {
+                  //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate using 65nm.
+                  mac_area = (1.53 + 0.3)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on average of die photo from Niagara 2, ISSCC "An 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS"
+                  //and"A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique" Frontend is PCS
+                  frontend_area = (9.8 + (6 + 18)*65/130*65/130)/3 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
+                  //SerDer is very hard to scale
+                  SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065);
+                  phy_area = frontend_area + SerDer_area;
+                  //total area
+                  area.set_area((mac_area + frontend_area + SerDer_area)*1e6);
+                  //Power
+                  //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
+                  mac_dyn      = 2.19e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
+                  //Cadence ChipEstimate using 65nm soft IP;
+                  frontend_dyn = 0.27e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate;
+                  //according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006
+                  //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+                  SerDer_dyn   = 0.01*10*sqrt(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+                  SerDer_dyn   /= niup.clockRate;//covert to energy per clock cycle of whole NIU
+
+                  //Cadence ChipEstimate using 65nm
+                  mac_gates       = 111700;
+                  frontend_gates  = 320000;
+                  SerDer_gates    = 200000;
+                  NMOS_sizing    = 5*g_tp.min_w_nmos_;
+                  PMOS_sizing    = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+
+
+          }
+          else
+          {//Low power implementations are mostly from Cadence ChipEstimator; Ignore the multiple IP effect
+                  // ---When there are multiple IP (same kind or not) selected, Cadence ChipEstimator results are not
+                  // a simple summation of all IPs. Ignore this effect
+                  mac_area      = 0.24 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  frontend_area = 0.1  * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);//Frontend is the PCS layer
+                  SerDer_area   = 0.35 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Compare 130um implementation in "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique"
+                  //and the ChipEstimator XAUI PHY hard IP, confirm that even PHY can scale perfectly with the technology
+                  //total area
+                  area.set_area((mac_area + frontend_area + SerDer_area)*1e6);
+                  //Power
+                  //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
+                  mac_dyn      = 1.257e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
+                  //Cadence ChipEstimate using 65nm soft IP;
+                  frontend_dyn = 0.6e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate;
+                  //SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm
+                  SerDer_dyn   = 0.0216*10*(interface_ip.F_sz_um/0.13)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+                  SerDer_dyn   /= niup.clockRate;//covert to energy per clock cycle of whole NIU
+
+                  mac_gates       = 111700;
+                  frontend_gates  = 52000;
+                  SerDer_gates    = 199260;
+
+                  NMOS_sizing    = g_tp.min_w_nmos_;
+                  PMOS_sizing    = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+
+          }
+
+          power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;
+          power_t.readOp.leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+          power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+          power_t.readOp.gate_leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+ }
+
+// Derive the reported power figures from the estimate cached in power_t.
+//   is_tdp == true  : power    = power_t, dynamic part scaled by niup.duty_cycle
+//   is_tdp == false : rt_power = power_t, dynamic part scaled by niup.perc_load
+// Only readOp.dynamic is rescaled; every other field is copied unchanged.
+void NIUController::computeEnergy(bool is_tdp)
+{
+    if (!is_tdp)
+    {
+        rt_power = power_t;
+        rt_power.readOp.dynamic *= niup.perc_load;
+        return;
+    }
+
+    power = power_t;
+    power.readOp.dynamic *= niup.duty_cycle;
+}
+
+void NIUController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {
+                cout << "NIU:" << endl;
+                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*niup.clockRate  << " W" << endl;
+                cout << indent_str<< "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*niup.clockRate << " W" << endl;
+                cout<<endl;
+        }
+        else
+        {
+
+        }
+
+}
+
+void NIUController::set_niu_param()
+{
+          niup.clockRate       = XML->sys.niu.clockrate;
+          niup.clockRate       *= 1e6;
+          niup.num_units       = XML->sys.niu.number_units;
+          niup.duty_cycle      = XML->sys.niu.duty_cycle;
+          niup.perc_load       = XML->sys.niu.total_load_perc;
+          niup.type            = XML->sys.niu.type;
+//       niup.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+}
+
+PCIeController::PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_)
+ {
+          local_result = init_interface(&interface_ip);
+          double frontend_area, phy_area, ctrl_area, SerDer_area;
+      double ctrl_dyn, frontend_dyn, SerDer_dyn;
+      double ctrl_gates,frontend_gates, SerDer_gates;
+          double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+          double NMOS_sizing, PMOS_sizing;
+
+          /* Assuming PCIe is bit-slice based architecture
+           * This is the reason for /8 in both area and power calculation
+           * to get per lane numbers
+           */
+
+          set_pcie_param();
+          if (pciep.type == 0) //high performance NIU
+          {
+                  //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate @ 65nm.
+                  ctrl_area = (5.2 + 0.5)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm.
+                  frontend_area = (5.2 + 0.1)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
+                  //SerDer is very hard to scale
+                  SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065);
+                  phy_area = frontend_area + SerDer_area;
+                  //total area
+                  //Power
+                  //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
+                  ctrl_dyn      = 3.75e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+                  //     //Cadence ChipEstimate using 65nm soft IP;
+                  //     frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+                  //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+                  SerDer_dyn   = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
+                  SerDer_dyn   /= pciep.clockRate;//covert to energy per clock cycle
+
+                  //power_t.readOp.dynamic = (ctrl_dyn)*pciep.num_channels;
+                  //Cadence ChipEstimate using 65nm
+                  ctrl_gates       = 900000/8*pciep.num_channels;
+                  //     frontend_gates   = 120000/8;
+                  //     SerDer_gates     = 200000/8;
+                  NMOS_sizing    = 5*g_tp.min_w_nmos_;
+                  PMOS_sizing    = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+          }
+          else
+          {
+                  ctrl_area = 0.412 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm.
+          SerDer_area = 0.36 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //total area
+                  //Power
+                  //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
+                  ctrl_dyn      = 2.21e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+                  //     //Cadence ChipEstimate using 65nm soft IP;
+                  //     frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+                  //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+                  SerDer_dyn   = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
+                  SerDer_dyn   /= pciep.clockRate;//covert to energy per clock cycle
+
+                  //Cadence ChipEstimate using 65nm
+                  ctrl_gates       = 200000/8*pciep.num_channels;
+                  //     frontend_gates   = 120000/8;
+                  SerDer_gates     = 200000/8*pciep.num_channels;
+                  NMOS_sizing    = g_tp.min_w_nmos_;
+                  PMOS_sizing    = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+
+          }
+          area.set_area(((ctrl_area + (pciep.withPHY? SerDer_area:0))/8*pciep.num_channels)*1e6);
+          power_t.readOp.dynamic = (ctrl_dyn + (pciep.withPHY? SerDer_dyn:0))*pciep.num_channels;
+          power_t.readOp.leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+          power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+          power_t.readOp.gate_leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+ }
+
+// Derive the reported power figures from the estimate cached in power_t.
+//   is_tdp == true  : power    = power_t, dynamic part scaled by pciep.duty_cycle
+//   is_tdp == false : rt_power = power_t, dynamic part scaled by pciep.perc_load
+// Only readOp.dynamic is rescaled; every other field is copied unchanged.
+void PCIeController::computeEnergy(bool is_tdp)
+{
+    if (!is_tdp)
+    {
+        rt_power = power_t;
+        rt_power.readOp.dynamic *= pciep.perc_load;
+        return;
+    }
+
+    power = power_t;
+    power.readOp.dynamic *= pciep.duty_cycle;
+}
+
+void PCIeController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {
+                cout << "PCIe:" << endl;
+                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*pciep.clockRate  << " W" << endl;
+                cout << indent_str<< "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*pciep.clockRate << " W" << endl;
+                cout<<endl;
+        }
+        else
+        {
+
+        }
+
+}
+
+void PCIeController::set_pcie_param()
+{
+          pciep.clockRate       = XML->sys.pcie.clockrate;
+          pciep.clockRate       *= 1e6;
+          pciep.num_units       = XML->sys.pcie.number_units;
+          pciep.num_channels    = XML->sys.pcie.num_channels;
+          pciep.duty_cycle      = XML->sys.pcie.duty_cycle;
+          pciep.perc_load       = XML->sys.pcie.total_load_perc;
+          pciep.type            = XML->sys.pcie.type;
+          pciep.withPHY         = XML->sys.pcie.withPHY;
+//       pciep.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+
+}
+
+FlashController::FlashController(ParseXML *XML_interface,InputParameter* interface_ip_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_)
+ {
+          local_result = init_interface(&interface_ip);
+          double frontend_area, phy_area, ctrl_area, SerDer_area;
+      double ctrl_dyn, frontend_dyn, SerDer_dyn;
+      double ctrl_gates,frontend_gates, SerDer_gates;
+          double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+          double NMOS_sizing, PMOS_sizing;
+
+          /* Assuming PCIe is bit-slice based architecture
+           * This is the reason for /8 in both area and power calculation
+           * to get per lane numbers
+           */
+
+          set_fc_param();
+          if (fcp.type == 0) //high performance NIU
+          {
+                  cout<<"Current McPAT does not support high performance flash contorller since even low power designs are enough for maintain throughput"<<endl;
+                  exit(0);
+                  NMOS_sizing    = 5*g_tp.min_w_nmos_;
+                  PMOS_sizing    = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+          }
+          else
+          {
+                  ctrl_area   = 0.243 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+                  //Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL from CAST
+          SerDer_area = 0.36/8 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
+          //based On PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, it support 8x lanes with each lane
+          //speed up to 250MB/s (PCIe1.1x) This is already saturate the 200MB/s of the flash controller core above.
+                  ctrl_gates      = 129267;
+                  SerDer_gates    = 200000/8;
+                  NMOS_sizing    = g_tp.min_w_nmos_;
+                  PMOS_sizing    = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+
+                  //Power
+                  //Cadence ChipEstimate using 65nm the controller 125mW for every 200MB/s This is power not energy!
+                  ctrl_dyn      = 0.125*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+                  //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+                  SerDer_dyn   = 0.01*1.6*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+                  //max  Per controller speed is 1.6Gb/s (200MB/s)
+          }
+          double number_channel = 1+(fcp.num_channels-1)*0.2;
+          area.set_area((ctrl_area + (fcp.withPHY? SerDer_area:0))*1e6*number_channel);
+          power_t.readOp.dynamic = (ctrl_dyn + (fcp.withPHY? SerDer_dyn:0))*number_channel;
+          power_t.readOp.leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+          power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+          power_t.readOp.gate_leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+ }
+
+// Derive the reported power figures from the estimate cached in power_t.
+//   is_tdp == true  : power    = power_t, dynamic part scaled by fcp.duty_cycle
+//   is_tdp == false : rt_power = power_t, dynamic part scaled by fcp.perc_load
+// Only readOp.dynamic is rescaled; every other field is copied unchanged.
+void FlashController::computeEnergy(bool is_tdp)
+{
+    if (!is_tdp)
+    {
+        rt_power = power_t;
+        rt_power.readOp.dynamic *= fcp.perc_load;
+        return;
+    }
+
+    power = power_t;
+    power.readOp.dynamic *= fcp.duty_cycle;
+}
+
+void FlashController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {
+                cout << "Flash Controller:" << endl;
+                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;//no multiply of clock since this is power already
+                cout << indent_str<< "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
+                cout<<endl;
+        }
+        else
+        {
+
+        }
+
+}
+
+void FlashController::set_fc_param()
+{
+//       fcp.clockRate       = XML->sys.flashc.mc_clock;
+//       fcp.clockRate       *= 1e6;
+          fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
+          fcp.num_channels    = ceil(fcp.peakDataTransferRate/200);
+          fcp.num_mcs         = XML->sys.flashc.number_mcs;
+          fcp.duty_cycle      = XML->sys.flashc.duty_cycle;
+          fcp.perc_load       = XML->sys.flashc.total_load_perc;
+          fcp.type            = XML->sys.flashc.type;
+          fcp.withPHY         = XML->sys.flashc.withPHY;
+//       flashcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+
+}
diff --git a/ext/mcpat/iocontrollers.h b/ext/mcpat/iocontrollers.h
new file mode 100644 (file)
index 0000000..818580a
--- /dev/null
@@ -0,0 +1,87 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#ifndef IOCONTROLLERS_H_
+#define IOCONTROLLERS_H_
+
+
+#endif /* IOCONTROLLERS_H_ */
+
+#include "XML_Parse.h"
+#include "parameter.h"
+//#include "io.h"
+#include "array.h"
+//#include "Undifferentiated_Core_Area.h"
+#include <vector>
+
+#include "basic_components.h"
+
+class NIUController : public Component {
+  public:
+        ParseXML *XML;
+        InputParameter interface_ip;
+    NIUParam  niup;
+    powerDef power_t;
+    uca_org_t local_result;
+    NIUController(ParseXML *XML_interface,InputParameter* interface_ip_);
+    void set_niu_param();
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~NIUController(){};
+};
+
+class PCIeController : public Component {
+  public:
+        ParseXML *XML;
+        InputParameter interface_ip;
+    PCIeParam  pciep;
+    powerDef power_t;
+    uca_org_t local_result;
+    PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_);
+    void set_pcie_param();
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~PCIeController(){};
+};
+
+class FlashController : public Component {
+  public:
+        ParseXML *XML;
+        InputParameter interface_ip;
+    MCParam  fcp;
+    powerDef power_t;
+    uca_org_t local_result;
+    FlashController(ParseXML *XML_interface,InputParameter* interface_ip_);
+    void set_fc_param();
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~FlashController(){};
+};
+
diff --git a/ext/mcpat/logic.cc b/ext/mcpat/logic.cc
new file mode 100644 (file)
index 0000000..11519d8
--- /dev/null
@@ -0,0 +1,1014 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "logic.h"
+
+
+//selection_logic
+selection_logic::selection_logic(
+    bool   _is_default,
+    int    win_entries_,
+    int    issue_width_,
+    const InputParameter *configure_interface,
+    enum Device_ty device_ty_,
+    enum Core_type core_ty_)
+    //const ParseXML *_XML_interface)
+ :is_default(_is_default),
+  win_entries(win_entries_),
+  issue_width(issue_width_),
+  device_ty(device_ty_),
+  core_ty(core_ty_)
+ {
+        //uca_org_t result2;
+        l_ip=*configure_interface;
+        local_result = init_interface(&l_ip);
+        //init_tech_params(l_ip.F_sz_um, false);
+        //win_entries=numIBEntries;//IQentries;
+                //issue_width=issueWidth;
+        selection_power();
+        double sckRation = g_tp.sckt_co_eff;
+        power.readOp.dynamic *= sckRation;
+        power.writeOp.dynamic *= sckRation;
+        power.searchOp.dynamic *= sckRation;
+
+        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+        power.readOp.longer_channel_leakage    = power.readOp.leakage*long_channel_device_reduction;
+         }
+
+void selection_logic::selection_power()
+{//based on cost effective superscalar processor TR pp27-31
+  double Ctotal, Cor, Cpencode;
+  int num_arbiter;
+  double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
+
+  //TODO: the 0.8um process data is used.
+  WSelORn      =       12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
+  WSelORprequ   =      50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process
+  WSelPn       =       12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process
+  WSelPp       =       18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process
+  WSelEnn      =       6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process
+  WSelEnp      =       12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
+
+
+  Ctotal=0;
+  num_arbiter=1;
+  while(win_entries > 4)
+    {
+      win_entries = (int)ceil((double)win_entries / 4.0);
+      num_arbiter += win_entries;
+    }
+  //the 4-input OR logic to generate anyreq
+  Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def);
+  power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;
+
+  //The total capacity of the 4-bit priority encoder
+  Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) +
+    2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) +
+    3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) +
+    4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic
+    2*4*gate_C(WSelEnn+WSelEnp,20.0)+
+    4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic
+    (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal
+
+  Ctotal += issue_width * num_arbiter*(Cor+Cpencode);
+
+  power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip
+  power.readOp.leakage = issue_width * num_arbiter *
+      (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
+       + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
+       + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
+       + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
+       + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
+                  )*g_tp.peri_global.Vdd;
+  power.readOp.gate_leakage = issue_width * num_arbiter *
+      (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
+       + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
+       + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
+       + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
+       + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
+        )*g_tp.peri_global.Vdd;
+}
+
+
+dep_resource_conflict_check::dep_resource_conflict_check(
+        const InputParameter *configure_interface,
+        const CoreDynParam & dyn_p_,
+        int   compare_bits_,
+    bool   _is_default)
+ :  l_ip(*configure_interface),
+    coredynp(dyn_p_),
+    compare_bits(compare_bits_),
+        is_default(_is_default)
+{
+        Wcompn         =       25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
+        Wevalinvp   =  25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
+        Wevalinvn   =  100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process
+        Wcomppreequ =          50 * l_ip.F_sz_um;//this was 40.0  micron for the 0.8 micron process
+        WNORn          =       6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process
+        WNORp          =       38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process
+
+        local_result = init_interface(&l_ip);
+
+        if (coredynp.core_ty==Inorder)
+                    compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator
+        else
+                compare_bits += 16 + 8 + 8;
+
+                conflict_check_power();
+        double sckRation = g_tp.sckt_co_eff;
+        power.readOp.dynamic *= sckRation;
+        power.writeOp.dynamic *= sckRation;
+        power.searchOp.dynamic *= sckRation;
+
+}
+
+void dep_resource_conflict_check::conflict_check_power()
+{
+        double Ctotal;
+        int num_comparators;
+        num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
+        //When decode-width ==1, no dcl logic
+
+        Ctotal = num_comparators * compare_cap();
+        //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);
+
+        power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/;
+        power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
+
+        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+        power.readOp.longer_channel_leakage    = power.readOp.leakage*long_channel_device_reduction;
+        power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+
+}
+
+/* estimate comparator power consumption (this comparator is similar
+   to the tag-match structure in a CAM */
+double dep_resource_conflict_check::compare_cap()
+{
+  double c1, c2;
+
+  WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in.
+  /* bottom part of comparator */
+  c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+
+  drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);
+
+  /* top part of comparator */
+  c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+
+                  drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) +  gate_C(WNORn + WNORp,10.0) +
+                  drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def);
+  return(c1 + c2);
+
+}
+
+// Recompute the leakage figures for a new temperature.
+//   temperature : rounded to the nearest multiple of 10 and stored in
+//                 l_ip.temp before init_interface() is run again
+// Only power.readOp.leakage, longer_channel_leakage and gate_leakage are
+// rewritten; the dynamic figure is left as it is.
+void dep_resource_conflict_check::leakage_feedback(double temperature)
+{
+  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
+  uca_org_t init_result = init_interface(&l_ip); // the returned value is not used
+
+  // Same leakage formulas as conflict_check_power().
+  // 2(N*N-N) comparators are used for source to dest comparison and
+  // (N*N-N) for dest to dest comparison, N being the decode width.
+  const int num_comparators =
+      3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);
+
+  power.readOp.leakage =
+      num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage*longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.gate_leakage =
+      num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+}
+
+//TODO: add inverter and transmission gate based DFF.
+
+DFFCell::DFFCell(
+                bool _is_dram,
+                double _WdecNANDn,
+                double _WdecNANDp,
+                double _cell_load,
+                const InputParameter *configure_interface)
+:is_dram(_is_dram),
+cell_load(_cell_load),
+WdecNANDn(_WdecNANDn),
+WdecNANDp(_WdecNANDp)
+{//this model is based on the NAND2 based DFF.
+                        l_ip=*configure_interface;
+//                     area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um);
+                        area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def)
+                                + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def));
+
+
+}
+
+
+double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out)
+{
+  double Ctotal = 0;
+  //printf("WdecNANDn = %E\n", WdecNANDn);
+
+  /* part 1: drain cap of NAND gate */
+  Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
+
+  /* part 2: gate cap of NAND gates */
+  Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+
+  return Ctotal;
+}
+
+
+void DFFCell::compute_DFF_cell()
+{
+        double c1, c2, c3, c4, c5, c6;
+           /* node 5 and node 6 are identical to node 1 in capacitance */
+           c1 = c5 = c6 = fpfp_node_cap(2, 1);
+           c2 = fpfp_node_cap(2, 3);
+           c3 = fpfp_node_cap(3, 2);
+           c4 = fpfp_node_cap(2, 2);
+
+           //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
+           clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+           e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
+
+           /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
+           e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
+           e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
+           e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
+
+           /* static power */
+           e_switch.readOp.leakage +=  (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
+                                           + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
+           e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
+                                           + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
+           //printf("leakage =%E\n",cmos_Ileak(1, is_dram) );
+}
+
+Pipeline::Pipeline(
+                const InputParameter *configure_interface,
+                const CoreDynParam & dyn_p_,
+                enum Device_ty device_ty_,
+                bool _is_core_pipeline,
+                bool _is_default)
+: l_ip(*configure_interface),
+  coredynp(dyn_p_),
+  device_ty(device_ty_),
+  is_core_pipeline(_is_core_pipeline),
+  is_default(_is_default),
+  num_piperegs(0.0)
+
+  {
+        local_result = init_interface(&l_ip);
+        if (!coredynp.Embedded)
+                process_ind = true;
+        else
+                process_ind = false;
+        WNANDn = (process_ind)? 25 *   l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was  20 micron for the 0.8 micron process
+        WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was  30 micron for the 0.8 micron process
+        load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false);
+        compute();
+
+}
+
+void Pipeline::compute()
+{
+        compute_stage_vector();
+        DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip);
+        pipe_reg.compute_DFF_cell();
+
+        double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
+        //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
+        //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
+        double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg;
+        double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
+        double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage;
+        power.readOp.dynamic   +=pipe_reg_power;
+        power.readOp.leakage   +=pipe_reg_leakage;
+        power.readOp.gate_leakage      +=pipe_reg_gate_leakage;
+        area.set_area(num_piperegs * pipe_reg.area.get_area());
+
+        double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty);
+        power.readOp.longer_channel_leakage    = power.readOp.leakage*long_channel_device_reduction;
+
+
+        double sckRation = g_tp.sckt_co_eff;
+        power.readOp.dynamic *= sckRation;
+        power.writeOp.dynamic *= sckRation;
+        power.searchOp.dynamic *= sckRation;
+        double macro_layout_overhead = g_tp.macro_layout_overhead;
+        if (!coredynp.Embedded)
+                area.set_area(area.get_area()*macro_layout_overhead);
+}
+
+void Pipeline::compute_stage_vector()
+{
+        double num_stages, tot_stage_vector, per_stage_vector;
+        int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length;
+        //Hthread = thread_clock_gated? 1:num_thread;
+
+  if (!is_core_pipeline)
+  {
+        num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput
+  }
+  else
+  {
+        if (coredynp.core_ty==Inorder)
+        {
+                /* assume 6 pipe stages and try to estimate bits per pipe stage */
+                /* pipe stage 0/IF */
+                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads;
+                /* pipe stage IF/ID */
+                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;
+                /* pipe stage IF/ThreadSEL */
+                if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states
+                /* pipe stage ID/EXE */
+                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads;
+                /* pipe stage EXE/MEM */
+                num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
+                /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
+                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
+//             /* pipe stage 5/6 */
+//             num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
+//             /* pipe stage 6/7 */
+//             num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
+//             /* pipe stage 7/8 */
+//             num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/);
+//             /* assume 50% extra in control signals (rule of thumb) */
+                num_stages=6;
+
+        }
+        else
+        {
+                /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
+                /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
+
+                /* pipe stage 0/1F*/
+                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC
+                /* pipe stage IF/ID */
+                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID
+                /* pipe stage 1D/Renaming*/
+                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage.
+                /* pipe stage Renaming/wire_drive */
+                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width);
+                /* pipe stage Renaming/IssueQ */
+                num_piperegs += coredynp.issueW*(coredynp.instruction_length  + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest
+                /* pipe stage IssueQ/Dispatch */
+                num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
+                /* pipe stage Dispatch/EXE */
+
+                num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
+                /* 2^opcode_length means the total decoded signal for the opcode*/
+                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
+                /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
+                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
+                /* pipe stage EXE/MEM, data need to be read/write, address*/
+                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed
+                /* pipe stage MEM/WB; result data, writeback regs */
+                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
+                /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
+                num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads;
+//             if (multithreaded)
+//             {
+//
+//             }
+                num_stages=12;
+
+        }
+
+        /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
+        num_piperegs = num_piperegs * 1.5;
+        tot_stage_vector=num_piperegs;
+        per_stage_vector=tot_stage_vector/num_stages;
+
+        if (coredynp.core_ty==Inorder)
+        {
+                if (coredynp.pipeline_stages>6)
+                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
+        }
+        else//OOO
+        {
+                if (coredynp.pipeline_stages>12)
+                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
+        }
+  }
+
+}
+
+FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ fu_type(fu_type_)
+{
+    double area_t;//, leakage, gate_leakage;
+    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+        clockRate = coredynp.clockRate;
+        executionTime = coredynp.executionTime;
+
+        //XML_interface=_XML_interface;
+        uca_org_t result2;
+        result2 = init_interface(&interface_ip);
+        if (XML->sys.Embedded)
+        {
+                if (fu_type == FPU)
+                {
+                        num_fu=coredynp.num_fpus;
+                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+                        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
+                        //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
+                        if (g_ip->F_sz_nm>90)
+                                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
+//                     base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+//                     base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        base_energy = 0;
+                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
+                        //FPU power from Sandia's processor sizing tech report
+                        FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
+                }
+                else if (fu_type == ALU)
+                {
+                        num_fu=coredynp.num_alus;
+                        area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+//                     base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+//                     base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        base_energy = 0;
+                        per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
+                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
+
+                }
+                else if (fu_type == MUL)
+                {
+                        num_fu=coredynp.num_muls;
+                        area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+//                     base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+//                     base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        base_energy = 0;
+                        per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
+                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
+                }
+                else
+                {
+                        cout<<"Unknown Functional Unit Type"<<endl;
+                        exit(0);
+                }
+                per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
+        }
+        else
+        {
+                if (fu_type == FPU)
+                {
+                        num_fu=coredynp.num_fpus;
+                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+                        area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
+                        if (g_ip->F_sz_nm>90)
+                                area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
+                        base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
+                        FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
+                }
+                else if (fu_type == ALU)
+                {
+                        num_fu=coredynp.num_alus;
+                        area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+                        base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
+                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
+
+                }
+                else if (fu_type == MUL)
+                {
+                        num_fu=coredynp.num_muls;
+                        area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+                        base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
+                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+                        per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
+                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
+                }
+                else
+                {
+                        cout<<"Unknown Functional Unit Type"<<endl;
+                        exit(0);
+                }
+        }
+        //IEXEU, simple ALU and FPU
+        //  double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation.
+        //
+        //  C_ALU        = 0.025e-9;//F
+        //  C_EXEU  = 0.05e-9; //F
+        //  C_FPU        = 0.35e-9;//F
+    area.set_area(area_t*num_fu);
+    leakage *= num_fu;
+    gate_leakage *=num_fu;
+        double macro_layout_overhead = g_tp.macro_layout_overhead;
+//     if (!XML->sys.Embedded)
+                area.set_area(area.get_area()*macro_layout_overhead);
+}
+
+void FunctionalUnit::computeEnergy(bool is_tdp)
+{
+        double pppm_t[4]    = {1,1,1,1};
+        double FU_duty_cycle;
+        if (is_tdp)
+        {
+
+
+                set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction.
+                if (fu_type == FPU)
+                {
+                        stats_t.readAc.access = num_fu;
+                        tdp_stats = stats_t;
+                        FU_duty_cycle = coredynp.FPU_duty_cycle;
+                }
+                else if (fu_type == ALU)
+                {
+                        stats_t.readAc.access = 1*num_fu;
+                        tdp_stats = stats_t;
+                        FU_duty_cycle = coredynp.ALU_duty_cycle;
+                }
+                else if (fu_type == MUL)
+                {
+                        stats_t.readAc.access = num_fu;
+                        tdp_stats = stats_t;
+                        FU_duty_cycle = coredynp.MUL_duty_cycle;
+                }
+
+            //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access;
+            power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate;
+                double sckRation = g_tp.sckt_co_eff;
+                power.readOp.dynamic *= sckRation*FU_duty_cycle;
+                power.writeOp.dynamic *= sckRation;
+                power.searchOp.dynamic *= sckRation;
+
+            power.readOp.leakage = leakage;
+            power.readOp.gate_leakage = gate_leakage;
+            double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+            power.readOp.longer_channel_leakage        = power.readOp.leakage*long_channel_device_reduction;
+
+        }
+        else
+        {
+                if (fu_type == FPU)
+                {
+                        stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
+                        rtp_stats = stats_t;
+                }
+                else if (fu_type == ALU)
+                {
+                        stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
+                        rtp_stats = stats_t;
+                }
+                else if (fu_type == MUL)
+                {
+                        stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
+                        rtp_stats = stats_t;
+                }
+
+            //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access;
+            rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime;
+                double sckRation = g_tp.sckt_co_eff;
+                rt_power.readOp.dynamic *= sckRation;
+                rt_power.writeOp.dynamic *= sckRation;
+                rt_power.searchOp.dynamic *= sckRation;
+
+        }
+
+
+}
+
+void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+//     cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
+        if (is_tdp)
+        {
+                if (fu_type == FPU)
+                {
+                        cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl;
+                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
+//                     cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next<< "Subthreshold Leakage = "
+                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                }
+                else if (fu_type == ALU)
+                {
+                        cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl;
+                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
+//                     cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next<< "Subthreshold Leakage = "
+                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+                }
+                else if (fu_type == MUL)
+                {
+                        cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl;
+                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
+//                     cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
+                        cout << indent_str_next<< "Subthreshold Leakage = "
+                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                        cout <<endl;
+
+                }
+
+        }
+        else
+        {
+        }
+
+}
+
+void FunctionalUnit::leakage_feedback(double temperature)
+{
+  // Update the temperature and initialize the global interfaces.
+  interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
+
+  uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy
+
+  // This is part of FunctionalUnit()
+  double area_t, leakage, gate_leakage;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+
+  if (fu_type == FPU)
+  {
+        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
+        if (g_ip->F_sz_nm>90)
+                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+  }
+  else if (fu_type == ALU)
+  {
+    area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+  }
+  else if (fu_type == MUL)
+  {
+    area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
+    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+  }
+  else
+  {
+    cout<<"Unknown Functional Unit Type"<<endl;
+    exit(1);
+  }
+
+  power.readOp.leakage = leakage*num_fu;
+  power.readOp.gate_leakage = gate_leakage*num_fu;
+  power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+}
+
+UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_,  bool embedded_)
+:XML(XML_interface),
+ ithCore(ithCore_),
+ interface_ip(*interface_ip_),
+ coredynp(dyn_p_),
+ core_ty(coredynp.core_ty),
+ embedded(XML->sys.Embedded),
+ pipeline_stage(coredynp.pipeline_stages),
+ num_hthreads(coredynp.num_hthreads),
+ issue_width(coredynp.issueW),
+ exist(exist_)
+// is_default(_is_default)
+{
+        if (!exist) return;
+        double undifferentiated_core=0;
+        double core_tx_density=0;
+        double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+        double undifferentiated_core_coe;
+        //XML_interface=_XML_interface;
+        uca_org_t result2;
+        result2 = init_interface(&interface_ip);
+
+        //Compute undifferentiated core area at 90nm.
+        if (embedded==false)
+        {
+                //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
+                if (core_ty==OOO)
+                {
+                        //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO
+                        undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0;
+                }
+                else if (core_ty==Inorder)
+                {
+                        //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder
+                        undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0;
+                }
+                else
+                {
+                        cout<<"invalid core type"<<endl;
+                        exit(0);
+                }
+                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716);
+        }
+        else
+        {
+                //Based on the results in paper "parametrized processor models" Sandia Labs
+                if (XML->sys.opt_clockrate)
+                        undifferentiated_core_coe = 0.05;
+                else
+                        undifferentiated_core_coe = 0;
+                undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe;
+                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426);
+        }
+
+        undifferentiated_core              *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2
+        core_tx_density                 = g_tp.scaling_factor.core_tx_density;
+        //undifferentiated_core                    = 3*1e6;
+        //undifferentiated_core                        *= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*;
+        power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+        power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
+
+        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+        power.readOp.longer_channel_leakage    = power.readOp.leakage*long_channel_device_reduction;
+        area.set_area(undifferentiated_core);
+
+        scktRatio = g_tp.sckt_co_eff;
+        power.readOp.dynamic *= scktRatio;
+        power.writeOp.dynamic *= scktRatio;
+        power.searchOp.dynamic *= scktRatio;
+        macro_PR_overhead = g_tp.macro_layout_overhead;
+        area.set_area(area.get_area()*macro_PR_overhead);
+
+
+
+//             double vt=g_tp.peri_global.Vth;
+//             double velocity_index=1.1;
+//             double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false);
+//             double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in;
+//             double w_nmos=g_tp.min_w_nmos_;
+//             double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+//             double i_on_n=1.0;
+//             double i_on_p=1.0;
+//             double i_on_n_in=1.0;
+//             double i_on_p_in=1;
+//             double vdd=g_tp.peri_global.Vdd;
+
+//             power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
+//             power.readOp.dynamic=c_out*vdd*vdd/2;
+
+//             cout<<power.readOp.dynamic << "dynamic" <<endl;
+//             cout<<power.readOp.sc << "sc" << endl;
+
+//             power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
+//             power.readOp.dynamic=c_out*vdd*vdd/2;
+//
+//             cout<<power.readOp.dynamic << "dynamic" <<endl;
+//             cout<<power.readOp.sc << "sc" << endl;
+
+
+
+}
+
+
+void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {
+                cout << indent_str << "UndiffCore:" << endl;
+                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
+                //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next<< "Subthreshold Leakage = "
+                                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+                cout << indent_str << "UndiffCore:" << endl;
+                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
+                cout <<endl;
+        }
+
+}
+
+inst_decoder::inst_decoder(
+                bool   _is_default,
+                const InputParameter *configure_interface,
+                int opcode_length_,
+                int num_decoders_,
+                bool x86_,
+            enum Device_ty device_ty_,
+            enum Core_type core_ty_)
+:is_default(_is_default),
+ opcode_length(opcode_length_),
+ num_decoders(num_decoders_),
+ x86(x86_),
+ device_ty(device_ty_),
+ core_ty(core_ty_)
+ {
+                        /*
+                         * Purpose: build the instruction-decoder model from one final
+                         * Decoder plus a Predec made of two predecode blocks and their
+                         * drivers, then derive this component's area and power from them.
+                         *
+                         * Parameters:
+                         *   _is_default         - stored in is_default; not otherwise used here.
+                         *   configure_interface - copied into l_ip and passed to init_interface().
+                         *   opcode_length_      - opcode width; split into segments of at most
+                         *                         18 bits (see below).
+                         *   num_decoders_       - multiplier applied to the decoder and
+                         *                         pre-decoder area.
+                         *   x86_                - stored in x86; read by inst_decoder_delay_power().
+                         *   device_ty_/core_ty_ - passed to longer_channel_device_reduction().
+                         *
+                         * Modelling background:
+                         * Instruction decoder is different from n to 2^n decoders
+                         * that are commonly used in row decoders in memory arrays.
+                         * The RISC instruction decoder is typically a very simple device.
+                         * We can decode an instruction by simply
+                         * separating the machine word into small parts using wire slices.
+                         * The RISC instruction decoder can be approximated by the n to 2^n decoders,
+                         * although this approximation usually underestimates power since each decoded
+                         * instruction normally has more than 1 active signal.
+                         *
+                         * However, decoding a CISC instruction word is much more difficult
+                         * than the RISC case. A CISC decoder is typically set up as a state machine.
+                         * The machine reads the opcode field to determine
+                         * what type of instruction it is,
+                         * and where the other data values are.
+                         * The instruction word is read in piece by piece,
+                         * and decisions are made at each stage as to
+                         * how the remainder of the instruction word will be read.
+                         * (sequencer and ROM are usually needed)
+                         * An x86 decoder can be even more complex since
+                         * it involves both decoding instructions into u-ops and
+                         * merging u-ops when doing micro-ops fusion.
+                         */
+                        bool is_dram=false;
+                        double pmos_to_nmos_sizing_r;
+                        double load_nmos_width, load_pmos_width;
+                        double C_driver_load, R_wire_load;
+                        Area cell;
+
+                        l_ip=*configure_interface; // keep a private copy; leakage_feedback() later edits l_ip.temp
+                        local_result = init_interface(&l_ip); // released via local_result.cleanup() in the destructor
+                        cell.h =g_tp.cell_h_def;
+                        cell.w =g_tp.cell_h_def; // NOTE(review): width is also taken from cell_h_def (square cell) - confirm intended
+
+                        num_decoder_segments = (int)ceil(opcode_length/18.0); // one segment per 18 opcode bits; must run before the clamp below
+                        if (opcode_length > 18)        opcode_length = 18; // each segment decodes at most 18 bits
+                        num_decoded_signals= (int)pow(2.0,opcode_length); // 2^opcode_length outputs per segment
+                        pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+                        load_nmos_width=g_tp.max_w_nmos_ /2;
+                        load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
+                        C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited
+                        R_wire_load   = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
+
+                        final_dec = new Decoder(
+                                        num_decoded_signals,
+                                        false,
+                                        C_driver_load,
+                                        R_wire_load,
+                                        false/*is_fa*/,
+                                        false/*is_dram*/,
+                                        false/*wl_tr*/, //to use peri device
+                                        cell);
+
+                        PredecBlk * predec_blk1 = new PredecBlk(
+                                        num_decoded_signals,
+                                        final_dec,
+                                        0,//Assuming predec and dec are back to back
+                                        0,
+                                        1,//Each Predec only drives one final dec
+                                        false/*is_dram*/,
+                                        true); // the two blocks differ only in this last flag
+                        PredecBlk * predec_blk2 = new PredecBlk(
+                                        num_decoded_signals,
+                                        final_dec,
+                                        0,//Assuming predec and dec are back to back
+                                        0,
+                                        1,//Each Predec only drives one final dec
+                                        false/*is_dram*/,
+                                        false);
+
+                        PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
+                        PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
+
+                        pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2); // blocks and drivers are freed through pre_dec in the destructor
+
+                        double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders; // scaled by outputs, segments and decoder count
+                        //double w_decoder    = area_decoder / area.get_h();
+                        double area_pre_dec = (predec_blk_drv1->area.get_area() +
+                                        predec_blk_drv2->area.get_area() +
+                                        predec_blk1->area.get_area() +
+                                        predec_blk2->area.get_area())*
+                                        num_decoder_segments*num_decoders;
+                        area.set_area(area.get_area()+ area_decoder + area_pre_dec);
+                        double macro_layout_overhead   = g_tp.macro_layout_overhead;
+                        double chip_PR_overhead        = g_tp.chip_layout_overhead;
+                        area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); // both layout overhead factors are applied to the summed area
+
+                        inst_decoder_delay_power(); // adds pre-decoder and final-decoder power into 'power'
+
+                        double sckRation = g_tp.sckt_co_eff; // scale factor applied to the three dynamic figures below
+                        power.readOp.dynamic *= sckRation;
+                        power.writeOp.dynamic *= sckRation;
+                        power.searchOp.dynamic *= sckRation;
+
+                        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+                        power.readOp.longer_channel_leakage    = power.readOp.leakage*long_channel_device_reduction; // leakage as reported when longer-channel devices are enabled
+
+}
+
+void inst_decoder::inst_decoder_delay_power()
+{
+
+        double dec_outrisetime;
+        double inrisetime=0, outrisetime;
+        double pppm_t[4]    = {1,1,1,1};
+        double squencer_passes = x86?2:1;
+
+        outrisetime = pre_dec->compute_delays(inrisetime);
+        dec_outrisetime = final_dec->compute_delays(outrisetime);
+        set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
+    power = power + pre_dec->power*pppm_t;
+    set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
+                num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
+    power = power + final_dec->power*pppm_t;
+}
+void inst_decoder::leakage_feedback(double temperature)
+{
+  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
+  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
+
+  final_dec->leakage_feedback(temperature);
+  pre_dec->leakage_feedback(temperature);
+
+  double pppm_t[4]    = {1,1,1,1};
+  double squencer_passes = x86?2:1;
+
+  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
+  power = pre_dec->power*pppm_t;
+
+  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
+  power = power + final_dec->power*pppm_t;
+
+  double sckRation = g_tp.sckt_co_eff;
+
+  power.readOp.dynamic *= sckRation;
+  power.writeOp.dynamic *= sckRation;
+  power.searchOp.dynamic *= sckRation;
+
+  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
+  power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
+}
+
+inst_decoder::~inst_decoder()
+{
+        // Frees everything the constructor allocated.
+        local_result.cleanup();
+
+        delete final_dec;
+
+        // The pre-decoder's blocks and drivers are deleted explicitly through its
+        // member pointers (blocks first, then drivers) before the Predec object
+        // itself is deleted.
+        Predec * const predec = pre_dec;
+        delete predec->blk1;
+        delete predec->blk2;
+        delete predec->drv1;
+        delete predec->drv2;
+        delete predec;
+}
diff --git a/ext/mcpat/logic.h b/ext/mcpat/logic.h
new file mode 100644 (file)
index 0000000..e2a35e8
--- /dev/null
@@ -0,0 +1,233 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+#ifndef LOGIC_H_
+#define LOGIC_H_
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+
+#include "XML_Parse.h"
+#include "arch_const.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "const.h"
+#include "decoder.h"
+#include "parameter.h"
+#include "xmlParser.h"
+
+using namespace std;
+
+class selection_logic : public Component{ // Selection-logic component, parameterised by window entries and issue width (see constructor).
+public:
+        selection_logic(bool _is_default, int    win_entries_,
+                            int  issue_width_, const InputParameter *configure_interface,
+                            enum Device_ty device_ty_=Core_device,
+                            enum Core_type core_ty_=Inorder);//, const ParseXML *_XML_interface);
+        bool is_default;
+        InputParameter l_ip; // presumably a private copy of *configure_interface, as in inst_decoder - confirm in logic.cc
+        uca_org_t local_result;
+        const ParseXML *XML_interface; // NOTE(review): the matching constructor parameter is commented out above - verify this is ever set
+        int win_entries;
+        int issue_width;
+        int num_threads; // NOTE(review): no constructor parameter corresponds to this member
+        enum Device_ty device_ty;
+        enum Core_type core_ty;
+
+        void selection_power();
+        void leakage_feedback(double temperature); // TODO
+};
+
+class dep_resource_conflict_check : public Component{ // Dependency/resource conflict-check component; compares compare_bits-wide values (see constructor).
+public:
+        dep_resource_conflict_check(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, int compare_bits_, bool _is_default=true);
+        InputParameter l_ip;
+        uca_org_t local_result; // cleaned up in the destructor below
+        double WNORn, WNORp, Wevalinvp, Wevalinvn, Wcompn, Wcompp, Wcomppreequ; // presumably transistor widths of the comparator circuit - confirm in logic.cc
+        CoreDynParam  coredynp;
+        int compare_bits;
+        bool is_default;
+        statsDef       tdp_stats;
+        statsDef       rtp_stats;
+        statsDef       stats_t;
+        powerDef       power_t;
+
+        void conflict_check_power();
+        double compare_cap();
+        ~dep_resource_conflict_check(){ // only releases local_result; no other cleanup is done here
+                local_result.cleanup();
+        }
+
+        void leakage_feedback(double temperature);
+};
+
+class inst_decoder: public Component{ // Instruction-decoder component built from one Decoder and one Predec (see logic.cc).
+public:
+        inst_decoder(bool _is_default, const InputParameter *configure_interface,
+                        int opcode_length_,
+                        int num_decoders_,
+                        bool x86_,
+                        enum Device_ty device_ty_=Core_device,
+                        enum Core_type core_ty_=Inorder);
+        inst_decoder(); // NOTE(review): declared here; confirm a definition exists before using it
+        bool is_default;
+        int  opcode_length; // clamped to at most 18 by the constructor
+        int  num_decoders;
+        bool x86; // true selects two sequencer passes instead of one in the power computation
+        int  num_decoder_segments; // ceil(opcode_length/18.0), computed before the clamp
+        int  num_decoded_signals; // 2^opcode_length, computed after the clamp
+        InputParameter l_ip; // private copy of *configure_interface
+        uca_org_t local_result; // result of init_interface(); cleaned up in the destructor
+        enum Device_ty device_ty;
+        enum Core_type core_ty;
+
+        Decoder * final_dec; // allocated in the constructor, deleted in the destructor
+        Predec *  pre_dec; // allocated in the constructor; its blk1/blk2/drv1/drv2 are deleted in the destructor too
+
+        statsDef       tdp_stats;
+        statsDef       rtp_stats;
+        statsDef       stats_t;
+        powerDef       power_t;
+        void inst_decoder_delay_power(); // adds pre-decoder and decoder power into 'power'
+        ~inst_decoder();
+        void leakage_feedback(double temperature); // recomputes 'power' for a new temperature
+};
+
+class DFFCell : public Component { // D flip-flop cell model (per the name); computed by compute_DFF_cell().
+public:
+        DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,double _cell_load,
+                          const InputParameter *configure_interface);
+        InputParameter l_ip;
+        bool is_dram;
+        double cell_load;
+        double WdecNANDn; // presumably NMOS width of the NAND gates - confirm in logic.cc
+        double WdecNANDp; // presumably PMOS width of the NAND gates - confirm in logic.cc
+        double clock_cap;
+        int    model;
+        int    n_switch; // n_* / e_* pairs below share a suffix: switch, keep_1, keep_0, clock
+        int    n_keep_1;
+        int    n_keep_0;
+        int    n_clock;
+        powerDef e_switch;
+        powerDef e_keep_1;
+        powerDef e_keep_0;
+        powerDef e_clock;
+
+        double fpfp_node_cap(unsigned int fan_in, unsigned int fan_out);
+        void compute_DFF_cell(void);
+        };
+
+class Pipeline : public Component{ // Pipeline-register component; defaults describe a core pipeline on core devices.
+public:
+        Pipeline(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, enum Device_ty device_ty_=Core_device, bool _is_core_pipeline=true, bool _is_default=true);
+        InputParameter l_ip;
+        uca_org_t local_result; // cleaned up in the destructor below
+        CoreDynParam  coredynp;
+        enum Device_ty device_ty;
+        bool is_core_pipeline, is_default;
+        double num_piperegs; // presumably filled in by compute_stage_vector() - confirm in logic.cc
+//     int pipeline_stages;
+//     int tot_stage_vector, per_stage_vector;
+        bool process_ind;
+        double WNANDn ;
+        double WNANDp;
+        double load_per_pipeline_stage;
+//     int  Hthread,  num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
+//     int  PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
+//     bool thread_clock_gated;
+//     bool in_order, multithreaded;
+        void compute_stage_vector();
+        void compute();
+        ~Pipeline(){ // only releases local_result
+                local_result.cleanup();
+        };
+
+};
+
+//class core_pipeline :public pipeline{
+//public:
+//     int  Hthread,  num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
+//     int  PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
+//     bool thread_clock_gated;
+//     bool in_order, multithreaded;
+//     core_pipeline(bool _is_default, const InputParameter *configure_interface);
+//     virtual void compute_stage_vector();
+//
+//};
+
+class FunctionalUnit :public Component{ // Functional-unit component of core 'ithCore'; the kind of unit is given by fu_type.
+public:
+        ParseXML *XML; // not freed by this class as far as this header shows - ownership presumably stays with the caller
+        int  ithCore;
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double FU_height;
+        double clockRate,executionTime;
+        double num_fu;
+        double energy, base_energy,per_access_energy, leakage, gate_leakage;
+        bool  is_default;
+        enum FU_type fu_type;
+        statsDef       tdp_stats;
+        statsDef       rtp_stats;
+        statsDef       stats_t;
+        powerDef       power_t;
+
+        FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type);
+    void computeEnergy(bool is_tdp=true); // is_tdp presumably selects TDP vs. runtime statistics - confirm in logic.cc
+        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    void leakage_feedback(double temperature);
+
+};
+
+class UndiffCore :public Component{ // Undifferentiated-core component: area and leakage of core logic not modelled individually.
+public:
+        UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true, bool embedded_=false);
+        ParseXML *XML; // read by displayEnergy() for sys.longer_channel_device
+        int  ithCore;
+        InputParameter interface_ip;
+        CoreDynParam  coredynp;
+        double clockRate,executionTime; // clockRate multiplies the dynamic figure printed by displayEnergy()
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        enum  Core_type core_ty;
+        bool   opt_performance, embedded;
+        double pipeline_stage,num_hthreads,issue_width;
+        bool   is_default;
+
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); // prints area, peak dynamic power and leakage to stdout
+        ~UndiffCore(){};
+        bool exist; // NOTE(review): declared after the other members, so it is initialised last regardless of initialiser-list order
+
+
+};
+#endif /* LOGIC_H_ */
diff --git a/ext/mcpat/main.cc b/ext/mcpat/main.cc
new file mode 100644 (file)
index 0000000..8acce8d
--- /dev/null
@@ -0,0 +1,101 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+#include <iostream>
+
+#include "XML_Parse.h"
+#include "globalvar.h"
+#include "io.h"
+#include "processor.h"
+#include "version.h"
+#include "xmlParser.h"
+
+using namespace std;
+
+void print_usage(char * argv0);
+
+int main(int argc,char *argv[])
+{
+        char * fb ;
+        bool infile_specified     = false;
+        int  plevel               = 2;
+        opt_for_clk    =true;
+        //cout.precision(10);
+        if (argc <= 1 || argv[1] == string("-h") || argv[1] == string("--help"))
+        {
+                print_usage(argv[0]);
+        }
+
+        for (int32_t i = 0; i < argc; i++)
+        {
+                if (argv[i] == string("-infile"))
+                {
+                        infile_specified = true;
+                        i++;
+                        fb = argv[ i];
+                }
+
+                if (argv[i] == string("-print_level"))
+                {
+                        i++;
+                        plevel = atoi(argv[i]);
+                }
+
+                if (argv[i] == string("-opt_for_clk"))
+                {
+                        i++;
+                        opt_for_clk = (bool)atoi(argv[i]);
+                }
+        }
+        if (infile_specified == false)
+        {
+                print_usage(argv[0]);
+        }
+
+
+        cout<<"McPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
+                << " of " << VER_UPDATE << ") is computing the target processor...\n "<<endl;
+
+        //parse XML-based interface
+        ParseXML *p1= new ParseXML();
+        p1->parse(fb);
+        Processor proc(p1);
+        proc.displayEnergy(2, plevel);
+        delete p1;
+        return 0;
+}
+
+void print_usage(char * argv0)
+{
+    // Writes the command-line synopsis to stderr and ends the process with
+    // exit status 1; this function never returns to its caller.
+    // argv0 is accepted for interface compatibility but is not printed.
+    cerr << "How to use McPAT:" << endl
+         << "  mcpat -infile <input file name>  -print_level < level of details 0~5 >  -opt_for_clk < 0 (optimize for ED^2P only)/1 (optimzed for target clock rate)>"<< endl;
+    exit(1);
+}
diff --git a/ext/mcpat/makefile b/ext/mcpat/makefile
new file mode 100644 (file)
index 0000000..27f213f
--- /dev/null
@@ -0,0 +1,28 @@
+# Top-level driver: builds McPAT by re-invoking make on $(TAR).mk with
+# TAG=dbg (debug) or TAG=opt (optimised). 'all' builds the optimised variant.
+TAR = mcpat
+
+# NOTE(review): 'all' is not listed as phony, and 'depend' is listed but has
+# no rule in this file.
+.PHONY: dbg opt depend clean clean_dbg clean_opt
+
+all: opt
+
+dbg: $(TAR).mk obj_dbg
+       @$(MAKE) TAG=dbg -C . -f $(TAR).mk
+
+opt: $(TAR).mk obj_opt
+       @$(MAKE) TAG=opt -C . -f $(TAR).mk
+
+# Per-configuration object directories, created the first time they are needed.
+obj_dbg:
+       mkdir $@
+
+obj_opt:
+       mkdir $@
+
+clean: clean_dbg clean_opt
+
+# Each clean target depends on its object directory, so a missing directory
+# is created first; the sub-make's clean then runs and the directory is removed.
+clean_dbg: obj_dbg
+       @$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
+       rm -rf $<
+
+clean_opt: obj_opt
+       @$(MAKE) TAG=opt -C . -f $(TAR).mk clean
+       rm -rf $<
+
diff --git a/ext/mcpat/mcpat.mk b/ext/mcpat/mcpat.mk
new file mode 100644 (file)
index 0000000..9aacbe0
--- /dev/null
@@ -0,0 +1,81 @@
+# Build rules for the 'mcpat' binary. The caller is expected to pass TAG:
+# TAG=dbg selects the debug flags, any other value the optimised flags.
+# Objects and the linked binary go to obj_$(TAG)/.
+TARGET = mcpat
+SHELL = /bin/sh
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+
+# NTHREADS defaults to 4 unless the caller provides it; it is passed to the
+# compiler as -DNTHREADS in the optimised build only (the debug build uses 1).
+ifndef NTHREADS
+  NTHREADS = 4
+endif
+
+
+# NOTE(review): despite its name, INCS holds a linker flag (-lm) and is used
+# only on the link line below.
+LIBS = 
+INCS = -lm
+
+ifeq ($(TAG),dbg)
+  DBG = -Wall 
+  OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
+else
+  DBG = 
+  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
+  #OPT = -O0 -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
+# -m32 makes both compilers generate 32-bit code.
+CXX = g++ -m32
+CC  = gcc -m32
+
+# Sources not found in this directory are also searched for under cacti/.
+VPATH = cacti
+
+SRCS  = \
+  Ucache.cc \
+  XML_Parse.cc \
+  arbiter.cc \
+  area.cc \
+  array.cc \
+  bank.cc \
+  basic_circuit.cc \
+  basic_components.cc \
+  cacti_interface.cc \
+  component.cc \
+  core.cc \
+  crossbar.cc \
+  decoder.cc \
+  htree2.cc \
+  interconnect.cc \
+  io.cc \
+  iocontrollers.cc \
+  logic.cc \
+  main.cc \
+  mat.cc \
+  memoryctrl.cc \
+  noc.cc \
+  nuca.cc \
+  parameter.cc \
+  processor.cc \
+  router.cc \
+  sharedcache.cc \
+  subarray.cc \
+  technology.cc \
+  uca.cc \
+  wire.cc \
+  xmlParser.cc 
+
+# Every source file maps to obj_$(TAG)/<name>.o.
+OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
+
+# The linked binary is copied next to this makefile.
+all: obj_$(TAG)/$(TARGET)
+       cp -f obj_$(TAG)/$(TARGET) $(TARGET)
+
+obj_$(TAG)/$(TARGET) : $(OBJS)
+       $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#      $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+obj_$(TAG)/%.o : %.cc
+       $(CXX) $(CXXFLAGS) -c $< -o $@
+
+# NOTE(review): this removes *.o from the current directory, but objects are
+# written to obj_$(TAG)/. That directory is only removed by the clean_dbg and
+# clean_opt targets of the top-level makefile.
+clean:
+       -rm -f *.o $(TARGET)
+
+
diff --git a/ext/mcpat/mcpatXeonCore.mk b/ext/mcpat/mcpatXeonCore.mk
new file mode 100644 (file)
index 0000000..20cf0dd
--- /dev/null
@@ -0,0 +1,81 @@
+# Build rules for the 'mcpatXeonCore' binary. The source list below contains
+# technology_xeon_core.cc. The caller is expected to pass TAG: TAG=dbg selects
+# the debug flags, any other value the optimised flags. Objects and the linked
+# binary go to obj_$(TAG)/.
+TARGET = mcpatXeonCore
+SHELL = /bin/sh
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+
+# NTHREADS defaults to 4 unless the caller provides it; it is passed to the
+# compiler as -DNTHREADS in the optimised build only (the debug build uses 1).
+ifndef NTHREADS
+  NTHREADS = 4
+endif
+
+
+# NOTE(review): despite its name, INCS holds a linker flag (-lm) and is used
+# only on the link line below.
+LIBS = 
+INCS = -lm
+
+ifeq ($(TAG),dbg)
+  DBG = -Wall 
+  OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
+else
+  DBG = 
+  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
+  #OPT = -O0 -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
+# -m32 makes both compilers generate 32-bit code.
+CXX = g++ -m32
+CC  = gcc -m32
+
+# Sources not found in this directory are also searched for under cacti/.
+VPATH = cacti
+
+SRCS  = \
+  Ucache.cc \
+  XML_Parse.cc \
+  arbiter.cc \
+  area.cc \
+  array.cc \
+  bank.cc \
+  basic_circuit.cc \
+  basic_components.cc \
+  cacti_interface.cc \
+  component.cc \
+  core.cc \
+  crossbar.cc \
+  decoder.cc \
+  htree2.cc \
+  interconnect.cc \
+  io.cc \
+  iocontrollers.cc \
+  logic.cc \
+  main.cc \
+  mat.cc \
+  memoryctrl.cc \
+  noc.cc \
+  nuca.cc \
+  parameter.cc \
+  processor.cc \
+  router.cc \
+  sharedcache.cc \
+  subarray.cc \
+  technology_xeon_core.cc \
+  uca.cc \
+  wire.cc \
+  xmlParser.cc 
+
+# Every source file maps to obj_$(TAG)/<name>.o.
+OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
+
+# The linked binary is copied next to this makefile.
+all: obj_$(TAG)/$(TARGET)
+       cp -f obj_$(TAG)/$(TARGET) $(TARGET)
+
+obj_$(TAG)/$(TARGET) : $(OBJS)
+       $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#      $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+obj_$(TAG)/%.o : %.cc
+       $(CXX) $(CXXFLAGS) -c $< -o $@
+
+# NOTE(review): this removes *.o from the current directory, but objects are
+# written to obj_$(TAG)/, which this target leaves in place.
+clean:
+       -rm -f *.o $(TARGET)
+
+
diff --git a/ext/mcpat/memoryctrl.cc b/ext/mcpat/memoryctrl.cc
new file mode 100644 (file)
index 0000000..ae3bc75
--- /dev/null
@@ -0,0 +1,736 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "const.h"
+#include "io.h"
+#include "logic.h"
+#include "memoryctrl.h"
+#include "parameter.h"
+
+/* overview of MC models:
+ * McPAT memory controllers are modeled according to a large number of industrial data points.
+ * The basic memory controller architecture is based on the Synopsys designs
+ * (DesignWare DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers)
+ * as in the Cadence ChipEstimator Tool.
+ *
+ * An MC has 3 parts as shown in this design. McPAT models both high performance MC
+ * based on Niagara processor designs and curve fitting, and low power MC based on data points in
+ * the Cadence ChipEstimator Tool.
+ *
+ * The frontend is modeled analytically, the backend is modeled empirically according to
+ * DDR2/DDR3-Lite protocol controllers in Cadence ChipEstimator Tool
+ * The PHY is modeled based on
+ * "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces," ISSCC 2006,
+ * and "A 14mW 6.25Gb/s Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007.
+ *
+ * In the Cadence ChipEstimator Tool there are two types of memory controllers: the full memory controllers
+ * that include the frontend, such as the DesignWare DDR2/DDR3-Lite memory controllers, and the backend-only
+ * memory controllers, such as the DDR2/DDR3-Lite protocol controllers (except for the DesignWare DDR2/DDR3-Lite memory
+ * controllers, all memory controller IP in the Cadence ChipEstimator Tool are backend memory controllers such as
+ * DDRC 1600A and DDRC 800A). Thus, to some extent the area and power difference between the DesignWare
+ * DDR2/DDR3-Lite memory controllers and the DDR2/DDR3-Lite protocol controllers can serve as an estimate of the
+ * frontend power and area, which is very close to the analytically modeled results of the frontend for Niagara2@65nm.
+ *
+ */
+
+/*
+ * Construct the MC backend model: copy the input parameters, the controller
+ * type and the MC parameters into the object, initialise local_result from
+ * the copied inputs and evaluate the model once through compute().
+ */
+MCBackend::MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_),
+      mc_type(mc_type_),
+      mcp(mcp_)
+{
+    local_result = init_interface(&l_ip);
+    compute();
+}
+
+
+void MCBackend::compute()
+{
+  //double max_row_addr_width = 20.0;//Current address 12~18bits
+  double C_MCB, mc_power, backend_dyn, backend_gates;//, refresh_period,refresh_freq;//Equivalent per bit Cap for backend,
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double NMOS_sizing, PMOS_sizing;
+
+  if (mc_type == MC)
+  {
+          if (mcp.type == 0)
+          {
+                  //area = (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09);
+                  area.set_area((2.7927*log(mcp.peakDataTransferRate*2)-19.862)/2.0*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6);//um^2
+                  //assuming the approximately same scaling factor as seen in processors.
+                  //C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode processor which has a very basic mc on chip.
+                  //C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers.The base power (W) is divided by device frequency and vdd and scale to target process.
+                  //mc_power = 0.0291*2;//29.1mW@200MHz @130nm From Power Analysis of SystemLevel OnChip Communication Architectures by Lahiri et
+                  mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
+                  C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
+                  power_t.readOp.dynamic = C_MCB*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(mcp.dataBusWidth/*+mcp.addressBusWidth*/);//per access energy in memory controller
+                  power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+                  power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+
+          }
+          else
+          {   NMOS_sizing        = g_tp.min_w_nmos_;
+                  PMOS_sizing    = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+                  area.set_area(0.15*mcp.dataBusWidth/72.0*(l_ip.F_sz_um/0.065)* (l_ip.F_sz_um/0.065)*mcp.num_channels*1e6);//um^2
+                  backend_dyn = 0.9e-9/800e6*mcp.clockRate/12800*mcp.peakDataTransferRate*mcp.dataBusWidth/72.0*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(l_ip.F_sz_nm/65.0);//Average on DDR2/3 protocol controller and DDRC 1600/800A in Cadence ChipEstimate
+                  //Scaling to technology and DIMM feature. The base IP support DDR3-1600(PC3 12800)
+                  backend_gates = 50000*mcp.dataBusWidth/64.0;//5000 is from Cadence ChipEstimator
+
+                  power_t.readOp.dynamic = backend_dyn;
+                  power_t.readOp.leakage = (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+                  power_t.readOp.gate_leakage = (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+
+          }
+  }
+  else
+  {//skip old model
+          cout<<"Unknown memory controllers"<<endl;exit(0);
+          area.set_area(0.243*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
+          //mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
+          C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
+          power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.dynamic *= 1.2;
+          power_t.readOp.leakage *= 1.2;
+          power_t.readOp.gate_leakage *= 1.2;
+          //flash controller has about 20% more backend power since BCH ECC in flash is complex and power hungry
+  }
+  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+void MCBackend::computeEnergy(bool is_tdp)
+{
+        //backend uses internal data buswidth
+        if (is_tdp)
+        {
+                //init stats for Peak
+                stats_t.readAc.access   = 0.5*mcp.num_channels;
+                stats_t.writeAc.access  = 0.5*mcp.num_channels;
+                tdp_stats = stats_t;
+        }
+        else
+        {
+                //init stats for runtime power (RTP)
+                stats_t.readAc.access   = mcp.reads;
+                stats_t.writeAc.access  = mcp.writes;
+                tdp_stats = stats_t;
+        }
+        if (is_tdp)
+    {
+                power = power_t;
+                power.readOp.dynamic   = (stats_t.readAc.access + stats_t.writeAc.access)*power_t.readOp.dynamic;
+
+    }
+    else
+    {
+        rt_power.readOp.dynamic        = (stats_t.readAc.access + stats_t.writeAc.access)*mcp.llcBlockSize*8.0/mcp.dataBusWidth*power_t.readOp.dynamic;
+        rt_power = rt_power + power_t*pppm_lkg;
+        rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+        //Assume 10% of peak power is consumed by routine job including memory refreshing and scrubbing
+    }
+}
+
+
+/*
+ * Construct the MC PHY model: copy the input parameters, the controller type
+ * and the MC parameters into the object, initialise local_result from the
+ * copied inputs and evaluate the model once through compute().
+ */
+MCPHY::MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_),
+      mc_type(mc_type_),
+      mcp(mcp_)
+{
+    local_result = init_interface(&l_ip);
+    compute();
+}
+
+void MCPHY::compute()
+{
+  //PHY uses internal data buswidth but the actuall off-chip datawidth is 64bits + ecc
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio() ;
+  /*
+   * according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006;
+   * From Cadence ChipEstimator for normal I/O around 0.4~0.8 mW/Gb/s
+   */
+  double power_per_gb_per_s, phy_dyn,phy_gates, NMOS_sizing, PMOS_sizing;
+
+  if (mc_type == MC)
+  {
+          if (mcp.type == 0)
+          {
+                  power_per_gb_per_s = mcp.LVDS? 0.01:0.04;
+                  //Based on die photos from Niagara 1 and 2.
+                  //TODO merge this into undifferentiated core.PHY only achieves square root of the ideal scaling.
+                  //area = (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
+                  area.set_area((6.4323*log(mcp.peakDataTransferRate*2)-48.134)*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6/2);//TODO:/2
+                  //This is from curve fitting based on Niagara 1 and 2's PHY die photo.
+                  //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
+                  //power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
+                  power_t.readOp.dynamic = power_per_gb_per_s*sqrt(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+                  power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+                  power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+
+          }
+          else
+          {
+                  NMOS_sizing    = g_tp.min_w_nmos_;
+                  PMOS_sizing    = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+                  //Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto DDR3 2133 (PC3 17066)
+                  double non_IO_percentage = 0.2;
+                  area.set_area(1.3*non_IO_percentage/2133.0e6*mcp.clockRate/17066*mcp.peakDataTransferRate*mcp.dataBusWidth/16.0*(l_ip.F_sz_um/0.040)* (l_ip.F_sz_um/0.040)*mcp.num_channels*1e6);//um^2
+                  phy_gates = 200000*mcp.dataBusWidth/64.0;
+                  power_per_gb_per_s = 0.01;
+                  //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
+                  power_t.readOp.dynamic = power_per_gb_per_s*(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+                  power_t.readOp.leakage = (mcp.withPHY? phy_gates:0)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+                  power_t.readOp.gate_leakage = (mcp.withPHY? phy_gates:0)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          }
+
+  }
+  else
+  {
+          area.set_area(0.4e6/2*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
+  }
+
+//  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
+//  power_t.readOp.dynamic *= phy_factor;
+//  power_t.readOp.leakage *= phy_factor;
+//  power_t.readOp.gate_leakage *= phy_factor;
+
+  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+
+void MCPHY::computeEnergy(bool is_tdp)
+{
+        if (is_tdp)
+        {
+                //init stats for Peak
+                stats_t.readAc.access   = 0.5*mcp.num_channels; //time share on buses
+                stats_t.writeAc.access  = 0.5*mcp.num_channels;
+                tdp_stats = stats_t;
+        }
+        else
+        {
+                //init stats for runtime power (RTP)
+                stats_t.readAc.access   = mcp.reads;
+                stats_t.writeAc.access  = mcp.writes;
+                tdp_stats = stats_t;
+        }
+
+        if (is_tdp)
+    {
+                double data_transfer_unit = (mc_type == MC)? 72:16;/*DIMM data width*/
+                power = power_t;
+                power.readOp.dynamic   = power.readOp.dynamic * (mcp.peakDataTransferRate*8*1e6/1e9/*change to Gbs*/)*mcp.dataBusWidth/data_transfer_unit*mcp.num_channels/mcp.clockRate;
+                // divide by clock rate is for match the final computation where *clock is used
+                //(stats_t.readAc.access*power_t.readOp.dynamic+
+//                                     stats_t.writeAc.access*power_t.readOp.dynamic);
+
+    }
+    else
+    {
+        rt_power = power_t;
+//     rt_power.readOp.dynamic = (stats_t.readAc.access*power_t.readOp.dynamic+
+//                                             stats_t.writeAc.access*power_t.readOp.dynamic);
+
+        rt_power.readOp.dynamic=power_t.readOp.dynamic*(stats_t.readAc.access + stats_t.writeAc.access)*(mcp.llcBlockSize)*8/1e9/mcp.executionTime*(mcp.executionTime);
+        rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+    }
+}
+
+/*
+ * Build the MC front end: a request reorder buffer, the selection/arbitration
+ * logic, a read buffer and a write buffer.
+ *
+ * The three arrays are configured by rewriting the same interface_ip copy in
+ * sequence, so statement order matters: a field that is not reassigned for a
+ * later array keeps the value set for the previous one.
+ * For each array, local_result.area times memory_channels_per_mc is added to
+ * the array's own area and to this component's area.
+ */
+MCFrontEnd::MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_),
+ mc_type(mc_type_),
+ mcp(mcp_),
+ MC_arb(0),
+ frontendBuffer(0),
+ readBuffer(0),
+ writeBuffer(0)
+{
+  /* All computations are for a single MC
+   *
+   */
+
+  int tag, data;
+  bool is_default =true;//indication for default setup
+
+  /* MC frontend engine channels share the same engines but logically partitioned
+   * For all hardware inside MC. different channels do not share resources.
+   * TODO: add decoding/mux stage to steer memory requests to different channels.
+   */
+
+  //memory request reorder buffer
+  //tag width in bits; line size in bytes (address + opcode bits rounded up)
+  tag                                                     = mcp.addressBusWidth  + EXTRA_TAG_BITS + mcp.opcodeW;
+  data                                                    = int(ceil((XML->sys.physical_address_width + mcp.opcodeW)/8.0));
+  interface_ip.cache_sz            = data*XML->sys.mc.req_window_size_per_channel;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.is_cache                           = true;
+  interface_ip.pure_cam            = false;
+  interface_ip.pure_ram            = false;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;
+  //one read, one write and one search port per memory channel
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  interface_ip.num_search_ports     = XML->sys.mc.memory_channels_per_mc;
+  frontendBuffer = new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device);
+  frontendBuffer->area.set_area(frontendBuffer->area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+
+  //selection and arbitration logic
+  //NOTE: built from interface_ip as configured for the reorder buffer above
+  MC_arb = new selection_logic(is_default, XML->sys.mc.req_window_size_per_channel,1,&interface_ip, Uncore_device);
+
+  //read buffers.
+  data                                                    = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte
+  interface_ip.cache_sz            = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 1;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 1;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.is_cache                           = false;
+  interface_ip.pure_cam            = false;
+  interface_ip.pure_ram            = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;//XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2;
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device);
+  readBuffer->area.set_area(readBuffer->area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+
+  //write buffer
+  //is_cache, pure_cam and pure_ram are not reassigned here, so the write
+  //buffer keeps the read buffer's values (false, false, true)
+  data                                                    = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte
+  interface_ip.cache_sz            = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 1;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device);
+  writeBuffer->area.set_area(writeBuffer->area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
+}
+
+void MCFrontEnd::computeEnergy(bool is_tdp)
+{
+        if (is_tdp)
+            {
+                //init stats for Peak
+                frontendBuffer->stats_t.readAc.access  = frontendBuffer->l_ip.num_search_ports;
+                frontendBuffer->stats_t.writeAc.access = frontendBuffer->l_ip.num_wr_ports;
+                frontendBuffer->tdp_stats = frontendBuffer->stats_t;
+
+                readBuffer->stats_t.readAc.access  = readBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
+                readBuffer->stats_t.writeAc.access = readBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
+                readBuffer->tdp_stats = readBuffer->stats_t;
+
+                writeBuffer->stats_t.readAc.access  = writeBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
+                writeBuffer->stats_t.writeAc.access = writeBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
+                writeBuffer->tdp_stats = writeBuffer->stats_t;
+
+            }
+            else
+            {
+                //init stats for runtime power (RTP)
+                frontendBuffer->stats_t.readAc.access  = XML->sys.mc.memory_reads *mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
+                //For each channel, each memory word need to check the address data to achieve best scheduling results.
+                //and this need to be done on all physical DIMMs in each logical memory DIMM *mcp.dataBusWidth/72
+                frontendBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
+                frontendBuffer->rtp_stats = frontendBuffer->stats_t;
+
+                readBuffer->stats_t.readAc.access  = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
+                readBuffer->stats_t.writeAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
+                readBuffer->rtp_stats = readBuffer->stats_t;
+
+                writeBuffer->stats_t.readAc.access  = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
+                writeBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
+                writeBuffer->rtp_stats = writeBuffer->stats_t;
+            }
+
+        frontendBuffer->power_t.reset();
+        readBuffer->power_t.reset();
+        writeBuffer->power_t.reset();
+
+//     frontendBuffer->power_t.readOp.dynamic  += (frontendBuffer->stats_t.readAc.access*
+//                     (frontendBuffer->local_result.power.searchOp.dynamic+frontendBuffer->local_result.power.readOp.dynamic)+
+//             frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic);
+
+                frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access +
+                                  frontendBuffer->stats_t.writeAc.access)*frontendBuffer->local_result.power.searchOp.dynamic
+                                + frontendBuffer->stats_t.readAc.access * frontendBuffer->local_result.power.readOp.dynamic
+                                + frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic;
+
+        readBuffer->power_t.readOp.dynamic     += (readBuffer->stats_t.readAc.access*
+                        readBuffer->local_result.power.readOp.dynamic+
+                readBuffer->stats_t.writeAc.access*readBuffer->local_result.power.writeOp.dynamic);
+        writeBuffer->power_t.readOp.dynamic    += (writeBuffer->stats_t.readAc.access*
+                        writeBuffer->local_result.power.readOp.dynamic+
+                writeBuffer->stats_t.writeAc.access*writeBuffer->local_result.power.writeOp.dynamic);
+
+        if (is_tdp)
+    {
+        power = power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
+                (frontendBuffer->local_result.power +
+                                readBuffer->local_result.power +
+                                writeBuffer->local_result.power)*pppm_lkg;
+
+    }
+    else
+    {
+        rt_power = rt_power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
+                (frontendBuffer->local_result.power +
+                                readBuffer->local_result.power +
+                                writeBuffer->local_result.power)*pppm_lkg;
+        rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+    }
+}
+
+void MCFrontEnd::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+
+        if (is_tdp)
+        {
+                cout << indent_str << "Front End ROB:" << endl;
+                cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << frontendBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+
+                cout <<endl;
+                cout << indent_str<< "Read Buffer:" << endl;
+                cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << readBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << readBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                cout <<endl;
+                cout << indent_str << "Write Buffer:" << endl;
+                cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << writeBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage  << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << writeBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+                cout << indent_str << "Front End ROB:" << endl;
+                cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage << " W" << endl;
+                cout <<endl;
+                cout << indent_str<< "Read Buffer:" << endl;
+                cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << readBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
+                cout <<endl;
+                cout << indent_str << "Write Buffer:" << endl;
+                cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << writeBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage  << " W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+
+}
+
+
+MemoryController::MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_),
+ mc_type(mc_type_),
+ frontend(0),
+ transecEngine(0),
+ PHY(0),
+ pipeLogic(0)
+{
+  /* All computations are for a single MC
+   *
+   */
+  interface_ip.wire_is_mat_type = 2;
+  interface_ip.wire_os_mat_type = 2;
+  interface_ip.wt               =Global;
+  set_mc_param();
+  frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type);
+  area.set_area(area.get_area()+ frontend->area.get_area());
+  transecEngine = new MCBackend(&interface_ip, mcp, mc_type);
+  area.set_area(area.get_area()+ transecEngine->area.get_area());
+  if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
+  {
+          PHY = new MCPHY(&interface_ip, mcp, mc_type);
+          area.set_area(area.get_area()+ PHY->area.get_area());
+  }
+  //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+//  transecEngine.initialize(&interface_ip);
+//  transecEngine.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+//  transecEngine.memDataWidth = dataBusWidth;
+//  transecEngine.memRank = XML->sys.mem.number_ranks;
+//  //transecEngine.memAccesses=XML->sys.mc.memory_accesses;
+//  //transecEngine.llcBlocksize=llcBlockSize;
+//  transecEngine.compute();
+//  transecEngine.area.set_area(XML->sys.mc.memory_channels_per_mc*transecEngine.area.get_area()) ;
+//  area.set_area(area.get_area()+ transecEngine.area.get_area());
+//  ///cout<<"area="<<area<<endl;
+////
+//  //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+//  PHY.initialize(&interface_ip);
+//  PHY.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+//  PHY.memDataWidth = dataBusWidth;
+//  //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+//  //PHY.llcBlocksize=llcBlockSize;
+//  PHY.compute();
+//  PHY.area.set_area(XML->sys.mc.memory_channels_per_mc*PHY.area.get_area()) ;
+//  area.set_area(area.get_area()+ PHY.area.get_area());
+  ///cout<<"area="<<area<<endl;
+//
+//  interface_ip.pipeline_stages = 5;//normal memory controller has five stages in the pipeline.
+//  interface_ip.per_stage_vector = addressBusWidth + XML->sys.core[0].opcode_width + dataBusWidth;
+//  pipeLogic = new pipeline(is_default, &interface_ip);
+//  //pipeLogic.init_pipeline(is_default, &interface_ip);
+//  pipeLogic->compute_pipeline();
+//  area.set_area(area.get_area()+ pipeLogic->area.get_area()*1e-6);
+//  area.set_area((area.get_area()+mc_area*1e-6)*1.1);//placement and routing overhead
+//
+//
+////  //clock
+////  clockNetwork.init_wire_external(is_default, &interface_ip);
+////  clockNetwork.clk_area           =area*1.1;//10% of placement overhead. rule of thumb
+////  clockNetwork.end_wiring_level   =5;//toplevel metal
+////  clockNetwork.start_wiring_level =5;//toplevel metal
+////  clockNetwork.num_regs           = pipeLogic.tot_stage_vector;
+////  clockNetwork.optimize_wire();
+
+
+}
+/*
+ * Compute the energy of every memory-controller sub-block and fold the
+ * results into this component's totals.
+ *
+ * is_tdp selects which total is updated: `power` when true, `rt_power`
+ * when false.  Both totals are accumulated into (x = x + ...), so calling
+ * this twice with the same flag adds the sub-block contribution twice.
+ */
+void MemoryController::computeEnergy(bool is_tdp)
+{
+        // Same predicate that guards every PHY access in this file:
+        // type 0, or type 1 configured with a PHY.
+        const bool has_phy = (mcp.type == 0) || (mcp.type == 1 && mcp.withPHY);
+
+        frontend->computeEnergy(is_tdp);
+        transecEngine->computeEnergy(is_tdp);
+        if (has_phy)
+        {
+                PHY->computeEnergy(is_tdp);
+        }
+
+        if (!is_tdp)
+        {
+                // Runtime totals.
+                rt_power = rt_power + frontend->rt_power + transecEngine->rt_power;
+                if (has_phy)
+                {
+                        rt_power = rt_power + PHY->rt_power;
+                }
+                return;
+        }
+
+        // TDP (peak) totals.
+        power = power + frontend->power + transecEngine->power;
+        if (has_phy)
+        {
+                power = power + PHY->power;
+        }
+}
+
+/*
+ * Print the area/power report of the memory controller to stdout.
+ *
+ * indent - number of leading spaces for first-level entries; nested
+ *          entries are indented by two more.
+ * plevel - verbosity; values above 2 additionally print the front-end
+ *          engine's own breakdown.
+ * is_tdp - true prints the full report (totals, front end, transaction
+ *          engine and, when modelled, PHY); false prints only the
+ *          controller totals.
+ */
+void MemoryController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (!is_tdp)
+        {
+                cout << "Memory Controller:" << endl;
+                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout<<endl;
+                return;
+        }
+
+        // Controller totals.  Peak dynamic is scaled by the clock rate,
+        // runtime dynamic is divided by the execution time.
+        cout << "Memory Controller:" << endl;
+        cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+        cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+        cout << indent_str<< "Subthreshold Leakage = "
+                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+        cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+        cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout<<endl;
+
+        cout << indent_str << "Front End Engine:" << endl;
+        cout << indent_str_next << "Area = " << frontend->area.get_area()*1e-6<< " mm^2" << endl;
+        cout << indent_str_next << "Peak Dynamic = " << frontend->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+                << (long_channel? frontend->power.readOp.longer_channel_leakage:frontend->power.readOp.leakage) <<" W" << endl;
+        cout << indent_str_next << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W" << endl;
+        cout << indent_str_next << "Runtime Dynamic = " << frontend->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout <<endl;
+        if (plevel >2){
+                // MCFrontEnd::displayEnergy takes (indent, plevel, is_tdp).
+                // All three are passed so that is_tdp is not silently
+                // bound to the plevel parameter.
+                frontend->displayEnergy(indent+4, plevel, is_tdp);
+        }
+
+        cout << indent_str << "Transaction Engine:" << endl;
+        cout << indent_str_next << "Area = " << transecEngine->area.get_area()*1e-6<< " mm^2" << endl;
+        cout << indent_str_next << "Peak Dynamic = " << transecEngine->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+                << (long_channel? transecEngine->power.readOp.longer_channel_leakage:transecEngine->power.readOp.leakage) <<" W" << endl;
+        cout << indent_str_next << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage << " W" << endl;
+        cout << indent_str_next << "Runtime Dynamic = " << transecEngine->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout <<endl;
+
+        // The PHY section is printed under the same predicate that guards
+        // the PHY in computeEnergy().
+        if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
+        {
+                cout << indent_str << "PHY:" << endl;
+                cout << indent_str_next << "Area = " << PHY->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << PHY->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? PHY->power.readOp.longer_channel_leakage:PHY->power.readOp.leakage) <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << PHY->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                cout <<endl;
+        }
+}
+
+/*
+ * Populate `mcp` from the parsed XML description (XML->sys / XML->sys.mc).
+ *
+ * Only mc_type == MC is handled.  Any other controller type prints a
+ * diagnostic and terminates the process with a non-zero exit status so
+ * that callers and scripts can detect the failure.
+ */
+void MemoryController::set_mc_param()
+{
+
+        if (mc_type==MC)
+        {
+          mcp.clockRate       =XML->sys.mc.mc_clock*2;//DDR double pumped
+          mcp.clockRate       *= 1e6; // presumably MHz -> Hz; confirm units of mc_clock
+          mcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+
+          // Widths are padded by ceil(width/8) to account for ECC overhead.
+          mcp.llcBlockSize    =int(ceil(XML->sys.mc.llc_line_length/8.0))+XML->sys.mc.llc_line_length;//ecc overhead
+          mcp.dataBusWidth    =int(ceil(XML->sys.mc.databus_width/8.0)) + XML->sys.mc.databus_width;
+          mcp.addressBusWidth =int(ceil(XML->sys.mc.addressbus_width));//XML->sys.physical_address_width;
+          mcp.opcodeW         =16;
+          mcp.num_mcs         = XML->sys.mc.number_mcs;
+          mcp.num_channels    = XML->sys.mc.memory_channels_per_mc;
+          mcp.reads  = XML->sys.mc.memory_reads;
+          mcp.writes = XML->sys.mc.memory_writes;
+          //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+          mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate;
+          mcp.memRank = XML->sys.mc.number_ranks;
+          //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+          //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+          //PHY.llcBlocksize=llcBlockSize;
+          mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
+          mcp.LVDS = XML->sys.mc.LVDS;
+          mcp.type = XML->sys.mc.type;
+          mcp.withPHY = XML->sys.mc.withPHY;
+        }
+//     else if (mc_type==FLASHC)
+//     {
+//             mcp.clockRate       =XML->sys.flashc.mc_clock*2;//DDR double pumped
+//             mcp.clockRate       *= 1e6;
+//             mcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+//
+//             mcp.llcBlockSize    =int(ceil(XML->sys.flashc.llc_line_length/8.0))+XML->sys.flashc.llc_line_length;//ecc overhead
+//             mcp.dataBusWidth    =int(ceil(XML->sys.flashc.databus_width/8.0)) + XML->sys.flashc.databus_width;
+//             mcp.addressBusWidth =int(ceil(XML->sys.flashc.addressbus_width));//XML->sys.physical_address_width;
+//             mcp.opcodeW         =16;
+//             mcp.num_mcs         = XML->sys.flashc.number_mcs;
+//             mcp.num_channels    = XML->sys.flashc.memory_channels_per_mc;
+//             mcp.reads  = XML->sys.flashc.memory_reads;
+//             mcp.writes = XML->sys.flashc.memory_writes;
+//             //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+//             mcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
+//             mcp.memRank = XML->sys.flashc.number_ranks;
+//             //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+//             //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+//             //PHY.llcBlocksize=llcBlockSize;
+//             mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
+//             mcp.LVDS = XML->sys.flashc.LVDS;
+//             mcp.type = XML->sys.flashc.type;
+//     }
+        else
+        {
+                cout<<"Unknown memory controller type: neither DRAM controller nor Flash controller" <<endl;
+                // Fatal configuration error: report failure to the caller
+                // instead of exiting with a success status.
+                exit(1);
+        }
+}
+
+// Release the arbiter and the three buffers held by the front end and
+// reset the pointers.  delete on a null pointer is a no-op, so no guards
+// are needed.
+MCFrontEnd ::~MCFrontEnd()
+{
+        delete MC_arb;
+        MC_arb = 0;
+
+        delete frontendBuffer;
+        frontendBuffer = 0;
+
+        delete readBuffer;
+        readBuffer = 0;
+
+        delete writeBuffer;
+        writeBuffer = 0;
+}
+
+// Release the sub-components held by the controller and reset the
+// pointers.  delete on a null pointer is a no-op, so no guards are needed.
+MemoryController ::~MemoryController()
+{
+        delete frontend;
+        frontend = 0;
+
+        delete transecEngine;
+        transecEngine = 0;
+
+        delete PHY;
+        PHY = 0;
+
+        delete pipeLogic;
+        pipeLogic = 0;
+}
+
diff --git a/ext/mcpat/memoryctrl.h b/ext/mcpat/memoryctrl.h
new file mode 100644 (file)
index 0000000..65be20a
--- /dev/null
@@ -0,0 +1,113 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef MEMORYCTRL_H_
+#define MEMORYCTRL_H_
+
+#include "XML_Parse.h"
+#include "parameter.h"
+//#include "io.h"
+#include "array.h"
+//#include "Undifferentiated_Core_Area.h"
+#include <vector>
+
+#include "basic_components.h"
+
+// Memory-controller back end.  MemoryController instantiates it as its
+// "transaction engine" (transecEngine).  Member functions are defined
+// outside this header.
+class MCBackend : public Component {
+  public:
+    InputParameter l_ip;
+    uca_org_t local_result;
+    enum MemoryCtrl_type mc_type;
+    MCParam mcp;
+    // presumably TDP / runtime statistics plus scratch copies; confirm
+    // against the implementation
+    statsDef tdp_stats;
+    statsDef rtp_stats;
+    statsDef stats_t;
+    powerDef power_t;
+
+    MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
+    void compute();
+    void computeEnergy(bool is_tdp = true);
+    void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+    ~MCBackend() {}
+};
+
+// Memory-controller PHY.  MemoryController only allocates one when its
+// configuration is type 0, or type 1 with withPHY set.  Member functions
+// are defined outside this header.
+class MCPHY : public Component {
+  public:
+    InputParameter l_ip;
+    uca_org_t local_result;
+    enum MemoryCtrl_type mc_type;
+    MCParam mcp;
+    // presumably TDP / runtime statistics plus scratch copies; confirm
+    // against the implementation
+    statsDef tdp_stats;
+    statsDef rtp_stats;
+    statsDef stats_t;
+    powerDef power_t;
+
+    MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
+    void compute();
+    void computeEnergy(bool is_tdp = true);
+    void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+    ~MCPHY() {}
+};
+
+// Memory-controller front end: an arbiter plus front-end, read and write
+// buffers.  The four pointers below are deleted by ~MCFrontEnd().
+class MCFrontEnd : public Component {
+  public:
+    ParseXML *XML;
+    InputParameter interface_ip;
+    enum MemoryCtrl_type mc_type;
+    MCParam mcp;
+    selection_logic *MC_arb;
+    ArrayST *frontendBuffer;
+    ArrayST *readBuffer;
+    ArrayST *writeBuffer;
+
+    MCFrontEnd(ParseXML *XML_interface, InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
+    void computeEnergy(bool is_tdp = true);
+    // Argument order is (indent, plevel, is_tdp); callers that want to
+    // pass is_tdp must also pass plevel.
+    void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+    ~MCFrontEnd();
+};
+
+// Top-level memory-controller component.  Aggregates a front end, a
+// transaction engine (MCBackend) and, depending on configuration, a PHY;
+// all pointer members are deleted by ~MemoryController().
+class MemoryController : public Component {
+  public:
+    ParseXML *XML;
+    InputParameter interface_ip;
+    enum MemoryCtrl_type mc_type;
+    MCParam mcp;
+    MCFrontEnd *frontend;
+    MCBackend *transecEngine;
+    // Only allocated when mcp.type==0, or mcp.type==1 with mcp.withPHY.
+    MCPHY *PHY;
+    // NOTE(review): the destructor deletes pipeLogic, but the code that
+    // would allocate it is commented out in the constructor; confirm it
+    // is initialised to 0 there.
+    Pipeline *pipeLogic;
+
+    //clock_network clockNetwork;
+    MemoryController(ParseXML *XML_interface, InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_);
+    void set_mc_param();
+    void computeEnergy(bool is_tdp = true);
+    void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+    ~MemoryController();
+};
+#endif /* MEMORYCTRL_H_ */
diff --git a/ext/mcpat/noc.cc b/ext/mcpat/noc.cc
new file mode 100644 (file)
index 0000000..d5dfbb1
--- /dev/null
@@ -0,0 +1,355 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "const.h"
+#include "io.h"
+#include "noc.h"
+#include "parameter.h"
+
+/*
+ * Build the model for the ithNoC_-th on-chip network described in the XML.
+ *
+ * Copies *interface_ip_ into a private InputParameter, selects the wire
+ * settings, loads the NoC parameters and then models either a router
+ * (nocdynp.type != 0) or a bus of length link_len_ (nocdynp.type == 0).
+ * For the router case the links are not created here; init_link_bus()
+ * has to be called separately by the owner.
+ */
+NoC::NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_, double link_len_)
+:XML(XML_interface),
+ithNoC(ithNoC_),
+interface_ip(*interface_ip_),
+router(0),
+link_bus(0),
+link_bus_exist(false),
+router_exist(false),
+M_traffic_pattern(M_traffic_pattern_)
+{
+        // Wire type and material selection differs between embedded and
+        // non-embedded systems.
+        if (!XML->sys.Embedded)
+        {
+                interface_ip.wt               = Global;
+                interface_ip.wire_is_mat_type = 2;
+                interface_ip.wire_os_mat_type = 2;
+        }
+        else
+        {
+                interface_ip.wt               = Global_30;
+                interface_ip.wire_is_mat_type = 0;
+                interface_ip.wire_os_mat_type = 1;
+        }
+
+        set_noc_param();
+        local_result = init_interface(&interface_ip);
+        scktRatio    = g_tp.sckt_co_eff;
+
+        if (!nocdynp.type)
+        {
+                // Bus: the bus itself is the only component to model.
+                init_link_bus(link_len_);
+        }
+        else
+        {
+                /*
+                 * NoC: only the router is computed here.  Router links must
+                 * be computed separately and called from external, since
+                 * total chip area must be known first.
+                 */
+                init_router();
+        }
+
+        //  //clock power
+        //  clockNetwork.init_wire_external(is_default, &interface_ip);
+        //  clockNetwork.clk_area           =area*1.1;//10% of placement overhead. rule of thumb
+        //  clockNetwork.end_wiring_level   =5;//toplevel metal
+        //  clockNetwork.start_wiring_level =5;//toplevel metal
+        //  clockNetwork.num_regs           = corepipe.tot_stage_vector;
+        //  clockNetwork.optimize_wire();
+}
+
+/*
+ * Create the Router model from the NoC parameters, add the area of all
+ * routers (one per node) to this component's area, and derive the
+ * longer-channel leakage of the router and its buffer, crossbar and
+ * arbiter from their regular leakage.
+ */
+void NoC::init_router()
+{
+        router = new Router(nocdynp.flit_size,
+                            nocdynp.virtual_channel_per_port*nocdynp.input_buffer_entries_per_vc,
+                            nocdynp.virtual_channel_per_port,
+                            &(g_tp.peri_global),
+                            nocdynp.input_ports,
+                            nocdynp.output_ports,
+                            M_traffic_pattern);
+        //router->print_router();
+        area.set_area(area.get_area()+ router->area.get_area()*nocdynp.total_nodes);
+
+        // Scale factor applied to regular leakage to obtain the
+        // longer-channel-device leakage.
+        double lkg_scale = longer_channel_device_reduction(Uncore_device);
+
+        router->power.readOp.longer_channel_leakage =
+                router->power.readOp.leakage * lkg_scale;
+        router->buffer.power.readOp.longer_channel_leakage =
+                router->buffer.power.readOp.leakage * lkg_scale;
+        router->crossbar.power.readOp.longer_channel_leakage =
+                router->crossbar.power.readOp.leakage * lkg_scale;
+        router->arbiter.power.readOp.longer_channel_leakage =
+                router->arbiter.power.readOp.leakage * lkg_scale;
+
+        router_exist = true;
+}
+
+/*
+ * Create the interconnect model for the router links (NoC) or for the
+ * bus, and add its area to this component.
+ *
+ * link_len_ - overall length to distribute; must be > 0 (asserted).  It
+ *             is divided down to a per-link length before the
+ *             interconnect is built.
+ */
+void NoC ::init_link_bus(double link_len_)
+{
+        link_name = nocdynp.type ? "Links" : "Bus";
+
+        link_len = link_len_;
+        assert(link_len>0);
+
+        interface_ip.throughput = nocdynp.link_throughput/nocdynp.clockRate;
+        interface_ip.latency    = nocdynp.link_latency/nocdynp.clockRate;
+
+        // NOTE(review): if horizontal_nodes/vertical_nodes are integers this
+        // divisor truncates (e.g. 1+2 -> 1) and becomes 0 when their sum is
+        // below 2 -- confirm the intended arithmetic.
+        link_len /= (nocdynp.horizontal_nodes + nocdynp.vertical_nodes)/2;
+
+        //All links are shared by neighbors
+        if (nocdynp.total_nodes >1)
+        {
+                link_len /=2;
+        }
+
+        // NOTE(review): the interconnect is constructed with `name`, not the
+        // `link_name` chosen above -- confirm this is intended.
+        link_bus = new interconnect(name, Uncore_device, 1, 1, nocdynp.flit_size,
+                                    link_len, &interface_ip, 3, true/*pipelinable*/,
+                                    nocdynp.route_over_perc);
+
+        // Per-router link area: one link per globally linked port.
+        link_bus_tot_per_Router.area.set_area(
+                link_bus_tot_per_Router.area.get_area()
+                + link_bus->area.get_area() * nocdynp.global_linked_ports);
+
+        area.set_area(area.get_area()+ link_bus_tot_per_Router.area.get_area()* nocdynp.total_nodes);
+        link_bus_exist = true;
+}
+/*
+ * Accumulate the NoC's power into `power` (is_tdp == true) or `rt_power`
+ * (is_tdp == false) from the router and/or link/bus models that exist.
+ *
+ * The TDP pass must run before the runtime pass: the runtime router term
+ * reuses router->power, which the TDP pass rescales in place.
+ * NOTE(review): because router->power is overwritten with its scaled
+ * value, a second TDP call would apply the duty-cycle factor again.
+ */
+void NoC::computeEnergy(bool is_tdp)
+{
+        //power_point_product_masks
+        // pppm_t is a 4-entry multiplier mask filled by set_pppm() and applied
+        // through operator* on power objects; what each entry scales is
+        // defined outside this function.
+        double pppm_t[4]    = {1,1,1,1};
+        double M=nocdynp.duty_cycle;
+        if (is_tdp)
+            {
+                //init stats for TDP
+                stats_t.readAc.access  = M;
+            tdp_stats = stats_t;
+            if (router_exist)
+            {
+                // Scale the single-router power by the duty cycle (in place),
+                // then add it once per node to the total.
+                set_pppm(pppm_t, 1*M, 1, 1, 1);//reset traffic pattern
+                router->power = router->power*pppm_t;
+                set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes);
+                    power     = power + router->power*pppm_t;
+            }
+            if (link_bus_exist)
+            {
+                // First mask entry combines traffic pattern, duty cycle and the
+                // number of active ports; the others are the linked-port count.
+                if (nocdynp.type)
+                        set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports -1), nocdynp.global_linked_ports,
+                                nocdynp.global_linked_ports, nocdynp.global_linked_ports);
+                    //reset traffic pattern; local port do not have router links
+                else
+                        set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports), nocdynp.global_linked_ports,
+                                                        nocdynp.global_linked_ports, nocdynp.global_linked_ports);//reset traffic pattern
+
+                link_bus_tot_per_Router.power = link_bus->power*pppm_t;
+
+                // Replicate the per-router link power across all nodes.
+                set_pppm(pppm_t, nocdynp.total_nodes,
+                                         nocdynp.total_nodes,
+                                         nocdynp.total_nodes,
+                                         nocdynp.total_nodes);
+                power     = power + link_bus_tot_per_Router.power*pppm_t;
+
+            }
+            }
+            else
+            {
+                //init stats for runtime power (RTP)
+                stats_t.readAc.access  = XML->sys.NoC[ithNoC].total_accesses;
+            rtp_stats = stats_t;
+                set_pppm(pppm_t, 1, 0 , 0, 0);
+                if (router_exist)
+                {
+                // Per-component runtime dynamic energy = per-access energy times
+                // the total access count read from the XML.
+                router->buffer.rt_power.readOp.dynamic = (router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)*rtp_stats.readAc.access ;
+                router->crossbar.rt_power.readOp.dynamic = router->crossbar.power.readOp.dynamic*rtp_stats.readAc.access ;
+                router->arbiter.rt_power.readOp.dynamic = router->arbiter.power.readOp.dynamic*rtp_stats.readAc.access ;
+
+                        // pppm_lkg is not declared in this function; it is
+                        // presumably a leakage-only mask defined elsewhere.
+                        router->rt_power = router->rt_power + (router->buffer.rt_power + router->crossbar.rt_power + router->arbiter.rt_power)*pppm_t +
+                                        router->power*pppm_lkg;//TDP power must be calculated first!
+                        rt_power     = rt_power + router->rt_power;
+                }
+                if (link_bus_exist)
+                {
+                        set_pppm(pppm_t, rtp_stats.readAc.access, 1 , 1, rtp_stats.readAc.access);
+                        link_bus->rt_power = link_bus->power * pppm_t;
+                        rt_power = rt_power + link_bus->rt_power;
+                }
+
+            }
+}
+
+
+/*
+ * Print the NoC/bus area and power report to stdout.
+ *
+ * indent - number of leading spaces for first-level entries.
+ * plevel - verbosity; values above 2 add the router's buffer, crossbar
+ *          and arbiter breakdown.
+ * is_tdp - true prints the report; when false nothing is printed (the
+ *          else branch holds only commented-out code).
+ */
+void NoC::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        double M =M_traffic_pattern*nocdynp.duty_cycle;
+        /*only router as a whole has been applied the M_traffic_pattern(0.6 by default) factor in router.cc;
+         *     When power of crossbars, arbiters, etc need to be displayed, the M_traffic_pattern factor need to
+         * be applied together with McPAT's extra traffic pattern.
+         * */
+        if (is_tdp)
+        {
+                // Totals for the whole NoC/bus.
+                cout << name << endl;
+                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str<< "Peak Dynamic = " << power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
+                cout << indent_str << "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str<< "Runtime Dynamic = " << rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                cout<<endl;
+
+                if (router_exist)
+                {
+                        // Figures for a single router.
+                        cout << indent_str << "Router: " << endl;
+                        cout << indent_str_next << "Area = " << router->area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next<< "Peak Dynamic = " << router->power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? router->power.readOp.longer_channel_leakage:router->power.readOp.leakage)  <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << router->power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next<< "Runtime Dynamic = " << router->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                        cout<<endl;
+                        if (plevel >2){
+                                // Sub-component breakdown; these entries are indented by
+                                // indent_str twice (or indent_str + indent_str_next).
+                                // Buffer area and leakage are multiplied by input_ports.
+                                cout << indent_str<< indent_str << "Virtual Channel Buffer:" << endl;
+                                cout << indent_str<< indent_str_next << "Area = " << router->buffer.area.get_area()*1e-6*nocdynp.input_ports<< " mm^2" << endl;
+                                cout << indent_str<< indent_str_next << "Peak Dynamic = " <<(router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)
+                                *nocdynp.min_ports*M*nocdynp.clockRate << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? router->buffer.power.readOp.longer_channel_leakage*nocdynp.input_ports:router->buffer.power.readOp.leakage*nocdynp.input_ports)  <<" W" << endl;
+                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->buffer.power.readOp.gate_leakage*nocdynp.input_ports << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->buffer.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                                cout <<endl;
+                                cout << indent_str<< indent_str<< "Crossbar:" << endl;
+                                cout << indent_str<< indent_str_next << "Area = " << router->crossbar.area.get_area()*1e-6  << " mm^2" << endl;
+                                cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->crossbar.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? router->crossbar.power.readOp.longer_channel_leakage:router->crossbar.power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->crossbar.power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->crossbar.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                                cout <<endl;
+                                // No area line is printed for the arbiter.
+                                cout << indent_str<< indent_str<< "Arbiter:" << endl;
+                                cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->arbiter.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M  << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? router->arbiter.power.readOp.longer_channel_leakage:router->arbiter.power.readOp.leakage)  << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->arbiter.power.readOp.gate_leakage  << " W" << endl;
+                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->arbiter.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                                cout <<endl;
+                        }
+                }
+                if (link_bus_exist)
+                {
+                        // Area, peak and leakage come from the per-router aggregate
+                        // (link_bus_tot_per_Router); runtime dynamic is read from
+                        // link_bus itself.
+                        cout << indent_str << (nocdynp.type? "Per Router ":"") << link_name<<": " << endl;
+                        cout << indent_str_next << "Area = " << link_bus_tot_per_Router.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next<< "Peak Dynamic = " << link_bus_tot_per_Router.power.readOp.dynamic*
+                                nocdynp.clockRate << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? link_bus_tot_per_Router.power.readOp.longer_channel_leakage:link_bus_tot_per_Router.power.readOp.leakage)
+                             <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << link_bus_tot_per_Router.power.readOp.gate_leakage
+                                << " W" << endl;
+                        cout << indent_str_next<< "Runtime Dynamic = " << link_bus->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
+                        cout<<endl;
+
+                }
+        }
+        else
+        {
+                // Nothing is printed on this path.  The commented-out lines below
+                // refer to core units (ifu, lsu, mmu, exu), not to NoC members.
+//             cout << indent_str_next << "Instruction Fetch Unit    Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
+//             cout << indent_str_next << "Instruction Fetch Unit    Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl;
+//             cout << indent_str_next << "Instruction Fetch Unit    Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Subthreshold Leakage = " << lsu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Load Store Unit   Gate Leakage = " << lsu->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Subthreshold Leakage = " << mmu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Memory Management Unit   Gate Leakage = " << mmu->rt_power.readOp.gate_leakage  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Subthreshold Leakage = " << exu->rt_power.readOp.leakage  << " W" << endl;
+//             cout << indent_str_next << "Execution Unit   Gate Leakage = " << exu->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+}
+
+/*
+ * Load nocdynp from the XML entry of this NoC (XML->sys.NoC[ithNoC]) and
+ * choose the display name ("NOC" for a routed network, "BUSES" for a bus).
+ * Asserts that chip_coverage and route_over_perc do not exceed 1.
+ */
+void NoC::set_noc_param()
+{
+        nocdynp.type          = XML->sys.NoC[ithNoC].type;
+        nocdynp.clockRate     = XML->sys.NoC[ithNoC].clockrate;
+        nocdynp.clockRate    *= 1e6;
+        nocdynp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+        nocdynp.flit_size     = XML->sys.NoC[ithNoC].flit_bits;
+
+        if (!nocdynp.type)
+        {
+                // Bus: a single input and a single output port, one link.
+                nocdynp.input_ports         = 1;
+                nocdynp.output_ports        = 1;
+                nocdynp.global_linked_ports = 1;
+        }
+        else
+        {
+                nocdynp.input_ports  = XML->sys.NoC[ithNoC].input_ports;
+                nocdynp.output_ports = XML->sys.NoC[ithNoC].output_ports;//later minus 1
+                /*
+                 * Except local i/o ports, all ports need links
+                 * (global_linked_ports).
+                 */
+                nocdynp.global_linked_ports = (nocdynp.input_ports-1) + (nocdynp.output_ports-1);
+        }
+        /*
+         * Only min_ports can be fully active simultaneously, since the
+         * smaller of the input/output port counts is the bottleneck.
+         */
+        nocdynp.min_ports = min(nocdynp.input_ports,nocdynp.output_ports);
+
+        nocdynp.virtual_channel_per_port    = XML->sys.NoC[ithNoC].virtual_channel_per_port;
+        nocdynp.input_buffer_entries_per_vc = XML->sys.NoC[ithNoC].input_buffer_entries_per_vc;
+
+        nocdynp.horizontal_nodes = XML->sys.NoC[ithNoC].horizontal_nodes;
+        nocdynp.vertical_nodes   = XML->sys.NoC[ithNoC].vertical_nodes;
+        nocdynp.total_nodes      = nocdynp.horizontal_nodes*nocdynp.vertical_nodes;
+        nocdynp.duty_cycle       = XML->sys.NoC[ithNoC].duty_cycle;
+        nocdynp.has_global_link  = XML->sys.NoC[ithNoC].has_global_link;
+        nocdynp.link_throughput  = XML->sys.NoC[ithNoC].link_throughput;
+        nocdynp.link_latency     = XML->sys.NoC[ithNoC].link_latency;
+        nocdynp.chip_coverage    = XML->sys.NoC[ithNoC].chip_coverage;
+        nocdynp.route_over_perc  = XML->sys.NoC[ithNoC].route_over_perc;
+
+        assert (nocdynp.chip_coverage <=1);
+        assert (nocdynp.route_over_perc <=1);
+
+        name = nocdynp.type ? "NOC" : "BUSES";
+}
+
+
+// Release the router and link/bus models (whichever were created) and
+// reset the pointers.  delete on a null pointer is a no-op, so no guards
+// are needed.
+NoC ::~NoC()
+{
+        delete router;
+        router = 0;
+
+        delete link_bus;
+        link_bus = 0;
+}
diff --git a/ext/mcpat/noc.h b/ext/mcpat/noc.h
new file mode 100644 (file)
index 0000000..31b5b3b
--- /dev/null
@@ -0,0 +1,75 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef NOC_H_
+#define NOC_H_
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "interconnect.h"
+#include "logic.h"
+#include "parameter.h"
+#include "router.h"
+
+/*
+ * Power/area model of one on-chip interconnect entry (XML->sys.NoC[ithNoC]).
+ * set_noc_param() names the instance "NOC" when nocdynp.type is non-zero and
+ * "BUSES" otherwise. The object owns the Router and interconnect models it
+ * points to; both are deleted in the destructor.
+ */
+class NoC :public Component {
+  public:
+
+        ParseXML *XML;
+        int  ithNoC;                 // index into XML->sys.NoC[]
+        InputParameter interface_ip;
+        double link_len;
+        double executionTime;
+        double scktRatio, chip_PR_overhead, macro_PR_overhead;
+        Router * router;             // owned; deleted in ~NoC()
+        interconnect * link_bus;     // owned; deleted in ~NoC()
+        NoCParam  nocdynp;           // filled from XML->sys.NoC[ithNoC] by set_noc_param()
+        uca_org_t local_result;
+        statsDef       tdp_stats;
+        statsDef       rtp_stats;
+        statsDef       stats_t;
+        powerDef       power_t;
+        // Processor's constructor multiplies this component's area by
+        // nocdynp.total_nodes when adding global-link area to the chip.
+        Component      link_bus_tot_per_Router;
+        bool link_bus_exist;
+        bool router_exist;
+        string name, link_name;      // name is "NOC" or "BUSES" (see set_noc_param())
+        double M_traffic_pattern;
+        // Processor passes M_traffic_pattern_ = 1, and a non-default link_len_
+        // only for bus-based interconnects.
+        NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_ = 0.6,double link_len_=0);
+        // Copies the XML->sys.NoC[ithNoC] settings into nocdynp and sets name.
+        void set_noc_param();
+        void computeEnergy(bool is_tdp=true);
+        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        // Called by Processor for NoCs with global links, after the chip area
+        // is known, with link_len_ = sqrt(chip area * chip_coverage).
+        void init_link_bus(double link_len_);
+        void init_router();
+        void computeEnergy_link_bus(bool is_tdp=true);
+        void displayEnergy_link_bus(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+        ~NoC();
+};
+
+#endif /* NOC_H_ */
diff --git a/ext/mcpat/processor.cc b/ext/mcpat/processor.cc
new file mode 100644 (file)
index 0000000..8520c96
--- /dev/null
@@ -0,0 +1,839 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_circuit.h"
+#include "const.h"
+#include "parameter.h"
+#include "processor.h"
+#include "version.h"
+
+Processor::Processor(ParseXML *XML_interface)
+:XML(XML_interface),//TODO: using one global copy may have problems.
+ mc(0),
+ niu(0),
+ pcie(0),
+ flashcontroller(0)
+{
+  /*
+   *  placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
+   *  There is no point to have heterogeneous memory controller on chip,
+   *  thus McPAT only support homogeneous memory controllers.
+   */
+  int i;
+  double pppm_t[4]    = {1,1,1,1};
+  set_proc_param();
+  if (procdynp.homoCore)
+          numCore = procdynp.numCore==0? 0:1;
+  else
+          numCore = procdynp.numCore;
+
+  if (procdynp.homoL2)
+          numL2 = procdynp.numL2==0? 0:1;
+  else
+          numL2 = procdynp.numL2;
+
+  if (XML->sys.Private_L2 && numCore != numL2)
+  {
+          cout<<"Number of private L2 does not match number of cores"<<endl;
+      exit(0);
+  }
+
+  if (procdynp.homoL3)
+          numL3 = procdynp.numL3==0? 0:1;
+  else
+          numL3 = procdynp.numL3;
+
+  if (procdynp.homoNOC)
+          numNOC = procdynp.numNOC==0? 0:1;
+  else
+          numNOC = procdynp.numNOC;
+
+//  if (!procdynp.homoNOC)
+//  {
+//       cout<<"Current McPAT does not support heterogeneous NOC"<<endl;
+//      exit(0);
+//  }
+
+  if (procdynp.homoL1Dir)
+          numL1Dir = procdynp.numL1Dir==0? 0:1;
+  else
+          numL1Dir = procdynp.numL1Dir;
+
+  if (procdynp.homoL2Dir)
+          numL2Dir = procdynp.numL2Dir==0? 0:1;
+  else
+          numL2Dir = procdynp.numL2Dir;
+
+  for (i = 0;i < numCore; i++)
+  {
+                  cores.push_back(new Core(XML,i, &interface_ip));
+                  cores[i]->computeEnergy();
+                  cores[i]->computeEnergy(false);
+                  if (procdynp.homoCore){
+                          core.area.set_area(core.area.get_area() + cores[i]->area.get_area()*procdynp.numCore);
+                          set_pppm(pppm_t,cores[i]->clockRate*procdynp.numCore, procdynp.numCore,procdynp.numCore,procdynp.numCore);
+                          core.power = core.power + cores[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/cores[i]->executionTime, procdynp.numCore,procdynp.numCore,procdynp.numCore);
+                          core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
+                          area.set_area(area.get_area() + core.area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
+                          power = power  + core.power;
+                          rt_power = rt_power  + core.rt_power;
+                  }
+                  else{
+                          core.area.set_area(core.area.get_area() + cores[i]->area.get_area());
+                          area.set_area(area.get_area() + cores[i]->area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
+
+                          set_pppm(pppm_t,cores[i]->clockRate, 1, 1, 1);
+                          core.power = core.power + cores[i]->power*pppm_t;
+                          power = power  + cores[i]->power*pppm_t;
+
+                          set_pppm(pppm_t,1/cores[i]->executionTime, 1, 1, 1);
+                          core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
+                          rt_power = rt_power  + cores[i]->rt_power*pppm_t;
+                  }
+  }
+
+  if (!XML->sys.Private_L2)
+  {
+  if (numL2 >0)
+          for (i = 0;i < numL2; i++)
+          {
+                  l2array.push_back(new SharedCache(XML,i, &interface_ip));
+                  l2array[i]->computeEnergy();
+                  l2array[i]->computeEnergy(false);
+                  if (procdynp.homoL2){
+                          l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area()*procdynp.numL2);
+                          set_pppm(pppm_t,l2array[i]->cachep.clockRate*procdynp.numL2, procdynp.numL2,procdynp.numL2,procdynp.numL2);
+                          l2.power = l2.power + l2array[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, procdynp.numL2,procdynp.numL2,procdynp.numL2);
+                          l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
+                          area.set_area(area.get_area() + l2.area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm
+                          power = power  + l2.power;
+                          rt_power = rt_power  + l2.rt_power;
+                  }
+                  else{
+                          l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area());
+                          area.set_area(area.get_area() + l2array[i]->area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm
+
+                          set_pppm(pppm_t,l2array[i]->cachep.clockRate, 1, 1, 1);
+                          l2.power = l2.power + l2array[i]->power*pppm_t;
+                          power = power  + l2array[i]->power*pppm_t;;
+                          set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, 1, 1, 1);
+                          l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
+                          rt_power = rt_power  + l2array[i]->rt_power*pppm_t;
+                  }
+          }
+  }
+
+  if (numL3 >0)
+          for (i = 0;i < numL3; i++)
+          {
+                  l3array.push_back(new SharedCache(XML,i, &interface_ip, L3));
+                  l3array[i]->computeEnergy();
+                  l3array[i]->computeEnergy(false);
+                  if (procdynp.homoL3){
+                          l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area()*procdynp.numL3);
+                          set_pppm(pppm_t,l3array[i]->cachep.clockRate*procdynp.numL3, procdynp.numL3,procdynp.numL3,procdynp.numL3);
+                          l3.power = l3.power + l3array[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, procdynp.numL3,procdynp.numL3,procdynp.numL3);
+              l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
+                          area.set_area(area.get_area() + l3.area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm
+                          power = power  + l3.power;
+                          rt_power = rt_power  + l3.rt_power;
+
+                  }
+                  else{
+                          l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area());
+                          area.set_area(area.get_area() + l3array[i]->area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm
+                          set_pppm(pppm_t,l3array[i]->cachep.clockRate, 1, 1, 1);
+                          l3.power = l3.power + l3array[i]->power*pppm_t;
+                          power = power  + l3array[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, 1, 1, 1);
+              l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
+              rt_power = rt_power  + l3array[i]->rt_power*pppm_t;
+
+                  }
+          }
+  if (numL1Dir >0)
+          for (i = 0;i < numL1Dir; i++)
+          {
+                  l1dirarray.push_back(new SharedCache(XML,i, &interface_ip, L1Directory));
+                  l1dirarray[i]->computeEnergy();
+                  l1dirarray[i]->computeEnergy(false);
+                  if (procdynp.homoL1Dir){
+                          l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area()*procdynp.numL1Dir);
+                          set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate*procdynp.numL1Dir, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
+                          l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
+              l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
+                          area.set_area(area.get_area() + l1dir.area.get_area());//placement and routing overhead is 10%, l1dir scales worse than cache 40% is accumulated from 90 to 22nm
+                          power = power  + l1dir.power;
+                          rt_power = rt_power  + l1dir.rt_power;
+
+                  }
+                  else{
+                          l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area());
+                          area.set_area(area.get_area() + l1dirarray[i]->area.get_area());
+                          set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate, 1, 1, 1);
+                          l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
+                          power = power  + l1dirarray[i]->power;
+                          set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, 1, 1, 1);
+              l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
+                          rt_power = rt_power  + l1dirarray[i]->rt_power;
+                  }
+          }
+
+  if (numL2Dir >0)
+          for (i = 0;i < numL2Dir; i++)
+          {
+                  l2dirarray.push_back(new SharedCache(XML,i, &interface_ip, L2Directory));
+                  l2dirarray[i]->computeEnergy();
+                  l2dirarray[i]->computeEnergy(false);
+                  if (procdynp.homoL2Dir){
+                          l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area()*procdynp.numL2Dir);
+                          set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate*procdynp.numL2Dir, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
+                          l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
+              l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
+                          area.set_area(area.get_area() + l2dir.area.get_area());//placement and routing overhead is 10%, l2dir scales worse than cache 40% is accumulated from 90 to 22nm
+                          power = power  + l2dir.power;
+                          rt_power = rt_power  + l2dir.rt_power;
+
+                  }
+                  else{
+                          l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area());
+                          area.set_area(area.get_area() + l2dirarray[i]->area.get_area());
+                          set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate, 1, 1, 1);
+                          l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
+                          power = power  + l2dirarray[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, 1, 1, 1);
+              l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
+                          rt_power = rt_power  + l2dirarray[i]->rt_power*pppm_t;
+                  }
+          }
+
+  if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
+  {
+          mc = new MemoryController(XML, &interface_ip, MC);
+          mc->computeEnergy();
+          mc->computeEnergy(false);
+          mcs.area.set_area(mcs.area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
+          area.set_area(area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
+          set_pppm(pppm_t,XML->sys.mc.number_mcs*mc->mcp.clockRate, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
+          mcs.power = mc->power*pppm_t;
+          power = power  + mcs.power;
+          set_pppm(pppm_t,1/mc->mcp.executionTime, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
+          mcs.rt_power = mc->rt_power*pppm_t;
+          rt_power = rt_power  + mcs.rt_power;
+
+  }
+
+  if (XML->sys.flashc.number_mcs >0 )//flash controller
+  {
+          flashcontroller = new FlashController(XML, &interface_ip);
+          flashcontroller->computeEnergy();
+          flashcontroller->computeEnergy(false);
+          double number_fcs = flashcontroller->fcp.num_mcs;
+          flashcontrollers.area.set_area(flashcontrollers.area.get_area()+flashcontroller->area.get_area()*number_fcs);
+          area.set_area(area.get_area()+flashcontrollers.area.get_area());
+          set_pppm(pppm_t,number_fcs, number_fcs ,number_fcs, number_fcs );
+          flashcontrollers.power = flashcontroller->power*pppm_t;
+          power = power  + flashcontrollers.power;
+          set_pppm(pppm_t,number_fcs , number_fcs ,number_fcs ,number_fcs );
+          flashcontrollers.rt_power = flashcontroller->rt_power*pppm_t;
+          rt_power = rt_power  + flashcontrollers.rt_power;
+
+  }
+
+  if (XML->sys.niu.number_units >0)
+  {
+          niu = new NIUController(XML, &interface_ip);
+          niu->computeEnergy();
+          niu->computeEnergy(false);
+          nius.area.set_area(nius.area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
+          area.set_area(area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
+          set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
+          nius.power = niu->power*pppm_t;
+          power = power  + nius.power;
+          set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
+          nius.rt_power = niu->rt_power*pppm_t;
+          rt_power = rt_power  + nius.rt_power;
+
+  }
+
+  if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels >0)
+  {
+          pcie = new PCIeController(XML, &interface_ip);
+          pcie->computeEnergy();
+          pcie->computeEnergy(false);
+          pcies.area.set_area(pcies.area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
+          area.set_area(area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
+          set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
+          pcies.power = pcie->power*pppm_t;
+          power = power  + pcies.power;
+          set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
+          pcies.rt_power = pcie->rt_power*pppm_t;
+          rt_power = rt_power  + pcies.rt_power;
+
+  }
+
+  if (numNOC >0)
+  {
+          for (i = 0;i < numNOC; i++)
+          {
+                  if (XML->sys.NoC[i].type)
+                  {//First add up area of routers if NoC is used
+                          nocs.push_back(new NoC(XML,i, &interface_ip, 1));
+                          if (procdynp.homoNOC)
+                          {
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
+                                  area.set_area(area.get_area() + noc.area.get_area());
+                          }
+                          else
+                          {
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
+                                  area.set_area(area.get_area() + nocs[i]->area.get_area());
+                          }
+                  }
+                  else
+                  {//Bus based interconnect
+                          nocs.push_back(new NoC(XML,i, &interface_ip, 1, sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage)));
+                          if (procdynp.homoNOC){
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
+                                  area.set_area(area.get_area() + noc.area.get_area());
+                          }
+                          else
+                          {
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
+                                  area.set_area(area.get_area() + nocs[i]->area.get_area());
+                          }
+                  }
+          }
+
+          /*
+           * Compute global links associated with each NOC, if any. This must be done at the end (even after the NOC router part) since the total chip
+           * area must be obtain to decide the link routing
+           */
+          for (i = 0;i < numNOC; i++)
+          {
+                  if (nocs[i]->nocdynp.has_global_link && XML->sys.NoC[i].type)
+                  {
+                          nocs[i]->init_link_bus(sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage));//compute global links
+                          if (procdynp.homoNOC)
+                          {
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
+                                                  * nocs[i]->nocdynp.total_nodes
+                                                  * procdynp.numNOC);
+                                  area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
+                                                  * nocs[i]->nocdynp.total_nodes
+                                                  * procdynp.numNOC);
+                          }
+                          else
+                          {
+                                  noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
+                                                  * nocs[i]->nocdynp.total_nodes);
+                                  area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
+                                                  * nocs[i]->nocdynp.total_nodes);
+                          }
+                  }
+          }
+          //Compute energy of NoC (w or w/o links) or buses
+          for (i = 0;i < numNOC; i++)
+          {
+                  nocs[i]->computeEnergy();
+                  nocs[i]->computeEnergy(false);
+                  if (procdynp.homoNOC){
+                          set_pppm(pppm_t,procdynp.numNOC*nocs[i]->nocdynp.clockRate, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
+                          noc.power = noc.power + nocs[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
+                          noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
+                          power = power  + noc.power;
+                          rt_power = rt_power  + noc.rt_power;
+                  }
+                  else
+                  {
+                          set_pppm(pppm_t,nocs[i]->nocdynp.clockRate, 1, 1, 1);
+                          noc.power = noc.power + nocs[i]->power*pppm_t;
+                          power = power  + nocs[i]->power*pppm_t;
+                          set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, 1, 1, 1);
+                          noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
+                          rt_power = rt_power  + nocs[i]->rt_power*pppm_t;
+
+
+                  }
+          }
+  }
+
+//  //clock power
+//  globalClock.init_wire_external(is_default, &interface_ip);
+//  globalClock.clk_area           =area*1e6; //change it from mm^2 to um^2
+//  globalClock.end_wiring_level   =5;//toplevel metal
+//  globalClock.start_wiring_level =5;//toplevel metal
+//  globalClock.l_ip.with_clock_grid=false;//global clock does not drive local final nodes
+//  globalClock.optimize_wire();
+
+}
+
+void Processor::displayDeviceType(int device_type_, uint32_t indent)
+{
+        string indent_str(indent, ' ');
+
+        switch ( device_type_ ) {
+
+          case 0 :
+                  cout <<indent_str<<"Device Type= "<<"ITRS high performance device type"<<endl;
+            break;
+          case 1 :
+                  cout <<indent_str<<"Device Type= "<<"ITRS low standby power device type"<<endl;
+            break;
+          case 2 :
+                  cout <<indent_str<<"Device Type= "<<"ITRS low operating power device type"<<endl;
+            break;
+          case 3 :
+                  cout <<indent_str<<"Device Type= "<<"LP-DRAM device type"<<endl;
+            break;
+          case 4 :
+                  cout <<indent_str<<"Device Type= "<<"COMM-DRAM device type"<<endl;
+            break;
+          default :
+                  {
+                          cout <<indent_str<<"Unknown Device Type"<<endl;
+                          exit(0);
+                  }
+        }
+}
+
+void Processor::displayInterconnectType(int interconnect_type_, uint32_t indent)
+{
+        string indent_str(indent, ' ');
+
+        switch ( interconnect_type_ ) {
+
+          case 0 :
+                  cout <<indent_str<<"Interconnect metal projection= "<<"aggressive interconnect technology projection"<<endl;
+            break;
+          case 1 :
+                  cout <<indent_str<<"Interconnect metal projection= "<<"conservative interconnect technology projection"<<endl;
+            break;
+          default :
+                  {
+                          cout <<indent_str<<"Unknown Interconnect Projection Type"<<endl;
+                          exit(0);
+                  }
+        }
+}
+
+void Processor::displayEnergy(uint32_t indent, int plevel, bool is_tdp)
+{
+        int i;
+        bool long_channel = XML->sys.longer_channel_device;
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        if (is_tdp)
+        {
+
+                if (plevel<5)
+                {
+                        cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
+                                        << " of " << VER_UPDATE << ") results (current print level is "<< plevel
+                        <<", please increase print level to see the details in components): "<<endl;
+                }
+                else
+                {
+                        cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
+                                                                << " of " << VER_UPDATE << ") results  (current print level is 5)"<< endl;
+                }
+                cout <<"*****************************************************************************************"<<endl;
+                cout <<indent_str<<"Technology "<<XML->sys.core_tech_node<<" nm"<<endl;
+                //cout <<indent_str<<"Device Type= "<<XML->sys.device_type<<endl;
+                if (long_channel)
+                        cout <<indent_str<<"Using Long Channel Devices When Appropriate"<<endl;
+                //cout <<indent_str<<"Interconnect metal projection= "<<XML->sys.interconnect_projection_type<<endl;
+                displayInterconnectType(XML->sys.interconnect_projection_type, indent);
+                cout <<indent_str<<"Core clock Rate(MHz) "<<XML->sys.core[0].clock_rate<<endl;
+        cout <<endl;
+                cout <<"*****************************************************************************************"<<endl;
+                cout <<"Processor: "<<endl;
+                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Power = " << power.readOp.dynamic +
+                        (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
+                cout << indent_str << "Total Leakage = " <<
+                        (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;
+                cout << indent_str << "Subthreshold Leakage = " << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
+                cout <<endl;
+                if (numCore >0){
+                cout <<indent_str<<"Total Cores: "<<XML->sys.number_of_cores << " cores "<<endl;
+                displayDeviceType(XML->sys.device_type,indent);
+                cout << indent_str_next << "Area = " << core.area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str_next << "Peak Dynamic = " << core.power.readOp.dynamic << " W" << endl;
+                cout << indent_str_next << "Subthreshold Leakage = "
+                        << (long_channel? core.power.readOp.longer_channel_leakage:core.power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str_next << "Subthreshold Leakage = " << core.power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str_next << "Gate Leakage = " << core.power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str_next << "Runtime Dynamic = " << core.rt_power.readOp.dynamic << " W" << endl;
+                cout <<endl;
+                }
+                if (!XML->sys.Private_L2)
+                {
+                        if (numL2 >0){
+                                cout <<indent_str<<"Total L2s: "<<endl;
+                                displayDeviceType(XML->sys.L2[0].device_type,indent);
+                                cout << indent_str_next << "Area = " << l2.area.get_area()*1e-6<< " mm^2" << endl;
+                                cout << indent_str_next << "Peak Dynamic = " << l2.power.readOp.dynamic << " W" << endl;
+                                cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? l2.power.readOp.longer_channel_leakage:l2.power.readOp.leakage) <<" W" << endl;
+                                //cout << indent_str_next << "Subthreshold Leakage = " << l2.power.readOp.longer_channel_leakage <<" W" << endl;
+                                cout << indent_str_next << "Gate Leakage = " << l2.power.readOp.gate_leakage << " W" << endl;
+                                cout << indent_str_next << "Runtime Dynamic = " << l2.rt_power.readOp.dynamic << " W" << endl;
+                                cout <<endl;
+                        }
+                }
+                if (numL3 >0){
+                        cout <<indent_str<<"Total L3s: "<<endl;
+                        displayDeviceType(XML->sys.L3[0].device_type, indent);
+                        cout << indent_str_next << "Area = " << l3.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << l3.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? l3.power.readOp.longer_channel_leakage:l3.power.readOp.leakage) <<" W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << l3.power.readOp.longer_channel_leakage <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << l3.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << l3.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (numL1Dir >0){
+                        cout <<indent_str<<"Total First Level Directory: "<<endl;
+                        displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
+                        cout << indent_str_next << "Area = " << l1dir.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << l1dir.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? l1dir.power.readOp.longer_channel_leakage:l1dir.power.readOp.leakage) <<" W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << l1dir.power.readOp.longer_channel_leakage <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << l1dir.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << l1dir.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (numL2Dir >0){
+                        cout <<indent_str<<"Total First Level Directory: "<<endl;
+                        displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
+                        cout << indent_str_next << "Area = " << l2dir.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << l2dir.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? l2dir.power.readOp.longer_channel_leakage:l2dir.power.readOp.leakage) <<" W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << l2dir.power.readOp.longer_channel_leakage <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << l2dir.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << l2dir.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (numNOC >0){
+                        cout <<indent_str<<"Total NoCs (Network/Bus): "<<endl;
+                        displayDeviceType(XML->sys.device_type, indent);
+                        cout << indent_str_next << "Area = " << noc.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << noc.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? noc.power.readOp.longer_channel_leakage:noc.power.readOp.leakage) <<" W" << endl;
+                        //cout << indent_str_next << "Subthreshold Leakage = " << noc.power.readOp.longer_channel_leakage  <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << noc.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << noc.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
+                {
+                        cout <<indent_str<<"Total MCs: "<<XML->sys.mc.number_mcs << " Memory Controllers "<<endl;
+                        displayDeviceType(XML->sys.device_type, indent);
+                        cout << indent_str_next << "Area = " << mcs.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << mcs.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? mcs.power.readOp.longer_channel_leakage:mcs.power.readOp.leakage)  <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << mcs.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << mcs.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (XML->sys.flashc.number_mcs >0)
+                {
+                        cout <<indent_str<<"Total Flash/SSD Controllers: "<<flashcontroller->fcp.num_mcs << " Flash/SSD Controllers "<<endl;
+                        displayDeviceType(XML->sys.device_type, indent);
+                        cout << indent_str_next << "Area = " << flashcontrollers.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << flashcontrollers.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? flashcontrollers.power.readOp.longer_channel_leakage:flashcontrollers.power.readOp.leakage)  <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << flashcontrollers.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << flashcontrollers.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (XML->sys.niu.number_units >0 )
+                {
+                        cout <<indent_str<<"Total NIUs: "<<niu->niup.num_units << " Network Interface Units "<<endl;
+                        displayDeviceType(XML->sys.device_type, indent);
+                        cout << indent_str_next << "Area = " << nius.area.get_area()*1e-6<< " mm^2" << endl;
+                        cout << indent_str_next << "Peak Dynamic = " << nius.power.readOp.dynamic << " W" << endl;
+                        cout << indent_str_next << "Subthreshold Leakage = "
+                                << (long_channel? nius.power.readOp.longer_channel_leakage:nius.power.readOp.leakage)  <<" W" << endl;
+                        cout << indent_str_next << "Gate Leakage = " << nius.power.readOp.gate_leakage << " W" << endl;
+                        cout << indent_str_next << "Runtime Dynamic = " << nius.rt_power.readOp.dynamic << " W" << endl;
+                        cout <<endl;
+                }
+                if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
+                                {
+                                        cout <<indent_str<<"Total PCIes: "<<pcie->pciep.num_units << " PCIe Controllers "<<endl;
+                                        displayDeviceType(XML->sys.device_type, indent);
+                                        cout << indent_str_next << "Area = " << pcies.area.get_area()*1e-6<< " mm^2" << endl;
+                                        cout << indent_str_next << "Peak Dynamic = " << pcies.power.readOp.dynamic << " W" << endl;
+                                        cout << indent_str_next << "Subthreshold Leakage = "
+                                                << (long_channel? pcies.power.readOp.longer_channel_leakage:pcies.power.readOp.leakage)  <<" W" << endl;
+                                        cout << indent_str_next << "Gate Leakage = " << pcies.power.readOp.gate_leakage << " W" << endl;
+                                        cout << indent_str_next << "Runtime Dynamic = " << pcies.rt_power.readOp.dynamic << " W" << endl;
+                                        cout <<endl;
+                                }
+                cout <<"*****************************************************************************************"<<endl;
+                if (plevel >1)
+                {
+                        for (i = 0;i < numCore; i++)
+                        {
+                                cores[i]->displayEnergy(indent+4,plevel,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        if (!XML->sys.Private_L2)
+                        {
+                                for (i = 0;i < numL2; i++)
+                                {
+                                        l2array[i]->displayEnergy(indent+4,is_tdp);
+                                        cout <<"*****************************************************************************************"<<endl;
+                                }
+                        }
+                        for (i = 0;i < numL3; i++)
+                        {
+                                l3array[i]->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        for (i = 0;i < numL1Dir; i++)
+                        {
+                                l1dirarray[i]->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        for (i = 0;i < numL2Dir; i++)
+                        {
+                                l2dirarray[i]->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
+                        {
+                                mc->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        if (XML->sys.flashc.number_mcs >0 && XML->sys.flashc.memory_channels_per_mc>0)
+                        {
+                                flashcontroller->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        if (XML->sys.niu.number_units >0 )
+                        {
+                                niu->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                        if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
+                        {
+                                pcie->displayEnergy(indent+4,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+
+                        for (i = 0;i < numNOC; i++)
+                        {
+                                nocs[i]->displayEnergy(indent+4,plevel,is_tdp);
+                                cout <<"*****************************************************************************************"<<endl;
+                        }
+                }
+        }
+        else
+        {
+
+        }
+
+}
+
+void Processor::set_proc_param()
+{
+        bool debug = false;
+
+        procdynp.homoCore = bool(debug?1:XML->sys.homogeneous_cores);
+        procdynp.homoL2   = bool(debug?1:XML->sys.homogeneous_L2s);
+        procdynp.homoL3   = bool(debug?1:XML->sys.homogeneous_L3s);
+        procdynp.homoNOC  = bool(debug?1:XML->sys.homogeneous_NoCs);
+        procdynp.homoL1Dir  = bool(debug?1:XML->sys.homogeneous_L1Directories);
+        procdynp.homoL2Dir  = bool(debug?1:XML->sys.homogeneous_L2Directories);
+
+        procdynp.numCore = XML->sys.number_of_cores;
+        procdynp.numL2   = XML->sys.number_of_L2s;
+        procdynp.numL3   = XML->sys.number_of_L3s;
+        procdynp.numNOC  = XML->sys.number_of_NoCs;
+        procdynp.numL1Dir  = XML->sys.number_of_L1Directories;
+        procdynp.numL2Dir  = XML->sys.number_of_L2Directories;
+        procdynp.numMC = XML->sys.mc.number_mcs;
+        procdynp.numMCChannel = XML->sys.mc.memory_channels_per_mc;
+
+//     if (procdynp.numCore<1)
+//     {
+//             cout<<" The target processor should at least have one core on chip." <<endl;
+//             exit(0);
+//     }
+
+        //  if (numNOCs<0 || numNOCs>2)
+        //    {
+        //       cout <<"number of NOCs must be 1 (only global NOCs) or 2 (both global and local NOCs)"<<endl;
+        //       exit(0);
+        //    }
+
+        /* Basic parameters*/
+        interface_ip.data_arr_ram_cell_tech_type    = debug?0:XML->sys.device_type;
+        interface_ip.data_arr_peri_global_tech_type = debug?0:XML->sys.device_type;
+        interface_ip.tag_arr_ram_cell_tech_type     = debug?0:XML->sys.device_type;
+        interface_ip.tag_arr_peri_global_tech_type  = debug?0:XML->sys.device_type;
+
+        interface_ip.ic_proj_type     = debug?0:XML->sys.interconnect_projection_type;
+        interface_ip.delay_wt                = 100;//Fixed number, make sure timing can be satisfied.
+        interface_ip.area_wt                 = 0;//Fixed number, This is used to exhaustive search for individual components.
+        interface_ip.dynamic_power_wt        = 100;//Fixed number, This is used to exhaustive search for individual components.
+        interface_ip.leakage_power_wt        = 0;
+        interface_ip.cycle_time_wt           = 0;
+
+        interface_ip.delay_dev                = 10000;//Fixed number, make sure timing can be satisfied.
+        interface_ip.area_dev                 = 10000;//Fixed number, This is used to exhaustive search for individual components.
+        interface_ip.dynamic_power_dev        = 10000;//Fixed number, This is used to exhaustive search for individual components.
+        interface_ip.leakage_power_dev        = 10000;
+        interface_ip.cycle_time_dev           = 10000;
+
+        interface_ip.ed                       = 2;
+        interface_ip.burst_len      = 1;//parameters are fixed for processor section, since memory is processed separately
+        interface_ip.int_prefetch_w = 1;
+        interface_ip.page_sz_bits   = 0;
+        interface_ip.temp = debug?360: XML->sys.temperature;
+        interface_ip.F_sz_nm         = debug?90:XML->sys.core_tech_node;//XML->sys.core_tech_node;
+        interface_ip.F_sz_um         = interface_ip.F_sz_nm / 1000;
+
+        //***********This section of code does not have real meaning, they are just to ensure all data will have initial value to prevent errors.
+        //They will be overridden  during each components initialization
+        interface_ip.cache_sz            =64;
+        interface_ip.line_sz             = 1;
+        interface_ip.assoc               = 1;
+        interface_ip.nbanks              = 1;
+        interface_ip.out_w               = interface_ip.line_sz*8;
+        interface_ip.specific_tag        = 1;
+        interface_ip.tag_w               = 64;
+        interface_ip.access_mode         = 2;
+
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power  = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t    = 1;
+
+        interface_ip.is_main_mem     = false;
+        interface_ip.rpters_in_htree = true ;
+        interface_ip.ver_htree_wires_over_array = 0;
+        interface_ip.broadcast_addr_din_over_ver_htrees = 0;
+
+        interface_ip.num_rw_ports        = 1;
+        interface_ip.num_rd_ports        = 0;
+        interface_ip.num_wr_ports        = 0;
+        interface_ip.num_se_rd_ports     = 0;
+        interface_ip.num_search_ports    = 1;
+        interface_ip.nuca                = 0;
+        interface_ip.nuca_bank_count     = 0;
+        interface_ip.is_cache            =true;
+        interface_ip.pure_ram            =false;
+        interface_ip.pure_cam            =false;
+        interface_ip.force_cache_config  =false;
+        if (XML->sys.Embedded)
+                {
+                interface_ip.wt                  =Global_30;
+                interface_ip.wire_is_mat_type = 0;
+                interface_ip.wire_os_mat_type = 0;
+                }
+        else
+                {
+                interface_ip.wt                  =Global;
+                interface_ip.wire_is_mat_type = 2;
+                interface_ip.wire_os_mat_type = 2;
+                }
+        interface_ip.force_wiretype      = false;
+        interface_ip.print_detail        = 1;
+        interface_ip.add_ecc_b_          =true;
+}
+
+Processor::~Processor(){
+        while (!cores.empty())
+        {
+                delete cores.back();
+                cores.pop_back();
+        }
+        while (!l2array.empty())
+        {
+                delete l2array.back();
+                l2array.pop_back();
+        }
+        while (!l3array.empty())
+        {
+                delete l3array.back();
+                l3array.pop_back();
+        }
+        while (!nocs.empty())
+        {
+                delete nocs.back();
+                nocs.pop_back();
+        }
+        if (!mc)
+        {
+                delete mc;
+        }
+        if (!niu)
+        {
+                delete niu;
+        }
+        if (!pcie)
+        {
+                delete pcie;
+        }
+        if (!flashcontroller)
+        {
+                delete flashcontroller;
+        }
+};
diff --git a/ext/mcpat/processor.h b/ext/mcpat/processor.h
new file mode 100644 (file)
index 0000000..5a7a2f7
--- /dev/null
@@ -0,0 +1,79 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+#ifndef PROCESSOR_H_
+#define PROCESSOR_H_
+
+#include <vector>
+
+#include "XML_Parse.h"
+#include "arbiter.h"
+#include "area.h"
+#include "array.h"
+#include "basic_components.h"
+#include "core.h"
+#include "decoder.h"
+#include "iocontrollers.h"
+#include "memoryctrl.h"
+#include "noc.h"
+#include "parameter.h"
+#include "router.h"
+#include "sharedcache.h"
+
+// Top-level chip model. It holds the per-component models (cores, shared
+// caches, directories, NoCs and I/O controllers) built from a parsed XML
+// description, together with chip-wide summary Components that
+// displayEnergy() prints as the "Total ..." sections of the report.
+class Processor : public Component
+{
+  public:
+        // Parsed configuration; set_proc_param() reads its sys.* fields.
+        ParseXML *XML;
+        // Per-instance component models. The destructor deletes the elements
+        // of cores, l2array, l3array and nocs.
+        vector<Core *> cores;
+    vector<SharedCache *> l2array;
+    vector<SharedCache *> l3array;
+    vector<SharedCache *> l1dirarray;
+    vector<SharedCache *> l2dirarray;
+    vector<NoC *>  nocs;
+    // Controller models; displayEnergy() dereferences each one only when the
+    // matching unit count in the XML description is greater than zero.
+    MemoryController * mc;
+    NIUController    * niu;
+    PCIeController   * pcie;
+    FlashController  * flashcontroller;
+    // Both of these are filled in by set_proc_param().
+    InputParameter interface_ip;
+    ProcParam procdynp;
+    //wire     globalInterconnect;
+    //clock_network globalClock;
+    // Chip-wide aggregates, one per component class, reported by
+    // displayEnergy(). NOTE(review): cc is not referenced in the code
+    // visible alongside this declaration -- confirm its purpose.
+    Component core, l2, l3, l1dir, l2dir, noc, mcs, cc, nius, pcies,flashcontrollers;
+    // Instance counts used as loop bounds when walking the vectors above.
+    int  numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir;
+    Processor(ParseXML *XML_interface);
+    void compute();
+    // Loads procdynp and interface_ip from XML->sys.
+    void set_proc_param();
+    // Prints the area/power report to standard output; per-component detail
+    // is included when plevel is greater than 1.
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    void displayDeviceType(int device_type_, uint32_t indent = 0);
+    void displayInterconnectType(int interconnect_type_, uint32_t indent = 0);
+    ~Processor();
+};
+
+#endif /* PROCESSOR_H_ */
diff --git a/ext/mcpat/results/A9_2000 b/ext/mcpat/results/A9_2000
new file mode 100644 (file)
index 0000000..e91243f
--- /dev/null
@@ -0,0 +1,321 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 40 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= conservative interconnect technology projection
+  Core clock Rate(MHz) 2000
+
+*****************************************************************************************
+Processor: 
+  Area = 5.83937 mm^2
+  Peak Power = 1.32283 W
+  Total Leakage = 0.182558 W
+  Peak Dynamic = 1.14027 W
+  Subthreshold Leakage = 0.0869601 W
+  Gate Leakage = 0.095598 W
+  Runtime Dynamic = 2.86361 W
+
+  Total Cores: 
+  Device Type= ITRS low operating power device type
+    Area = 5.33485 mm^2
+    Peak Dynamic = 1.07823 W
+    Subthreshold Leakage = 0.0827641 W
+    Gate Leakage = 0.0887315 W
+    Runtime Dynamic = 0.975395 W
+
+  Total First Level Directory: 
+  Device Type= ITRS low operating power device type
+    Area = 0.489711 mm^2
+    Peak Dynamic = 0.0449752 W
+    Subthreshold Leakage = 0.00397708 W
+    Gate Leakage = 0.00655632 W
+    Runtime Dynamic = 1.80289 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS low operating power device type
+    Area = 0.0148119 mm^2
+    Peak Dynamic = 0.0170648 W
+    Subthreshold Leakage = 0.000218992 W
+    Gate Leakage = 0.000310207 W
+    Runtime Dynamic = 0.0853239 W
+
+*****************************************************************************************
+Core:
+      Area = 2.66742 mm^2
+      Peak Dynamic = 0.539116 W
+      Subthreshold Leakage = 0.041382 W
+      Gate Leakage = 0.0443657 W
+      Runtime Dynamic = 0.975395 W
+
+      Instruction Fetch Unit:
+        Area = 0.565848 mm^2
+        Peak Dynamic = 0.184724 W
+        Subthreshold Leakage = 0.00572394 W
+        Gate Leakage = 0.00380598 W
+        Runtime Dynamic = 0.283222 W
+
+          Instruction Cache:
+            Area = 0.235613 mm^2
+            Peak Dynamic = 0.0310428 W
+            Subthreshold Leakage = 0.00309635 W
+            Gate Leakage = 0.00216385 W
+            Runtime Dynamic = 0.0461626 W
+
+          Branch Target Buffer:
+            Area = 0.251259 mm^2
+            Peak Dynamic = 0.0174433 W
+            Subthreshold Leakage = 0.00170231 W
+            Gate Leakage = 0.000908123 W
+            Runtime Dynamic = 0.0697733 W
+
+          Branch Predictor:
+            Area = 0.064441 mm^2
+            Peak Dynamic = 0.00815792 W
+            Subthreshold Leakage = 0.00070444 W
+            Gate Leakage = 0.000477387 W
+            Runtime Dynamic = 0.0113878 W
+
+              Global Predictor:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00374527 W
+                Subthreshold Leakage = 0.00034631 W
+                Gate Leakage = 0.000233555 W
+                Runtime Dynamic = 0.00545806 W
+
+              Local Predictor:
+                Area = 0.000711939 mm^2
+                Peak Dynamic = 0.000301014 W
+                Subthreshold Leakage = 6.13457e-06 W
+                Gate Leakage = 5.63471e-06 W
+                Runtime Dynamic = 0.000471566 W
+
+                Area = 0.000650815 mm^2
+                Peak Dynamic = 0.000230123 W
+                Subthreshold Leakage = 5.7769e-06 W
+                Gate Leakage = 4.75075e-06 W
+                Runtime Dynamic = 0.000354988 W
+
+              Chooser:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00374527 W
+                Subthreshold Leakage = 0.00034631 W
+                Gate Leakage = 0.000233555 W
+                Runtime Dynamic = 0.00545806 W
+
+              RAS:
+                Area = 0.000996272 mm^2
+                Peak Dynamic = 0.000366372 W
+                Subthreshold Leakage = 5.68653e-06 W
+                Gate Leakage = 4.64147e-06 W
+                Runtime Dynamic = 6.23994e-08 W
+
+          Instruction Buffer:
+            Area = 0.00820192 mm^2
+            Peak Dynamic = 0.0669878 W
+            Subthreshold Leakage = 6.33536e-05 W
+            Gate Leakage = 4.34841e-05 W
+            Runtime Dynamic = 0.0382787 W
+
+          Instruction Decoder:
+            Area = 0.00468731 mm^2
+            Peak Dynamic = 0.05881 W
+            Subthreshold Leakage = 0.000127696 W
+            Gate Leakage = 0.000115494 W
+            Runtime Dynamic = 0.11762 W
+
+      Renaming Unit:
+        Area = 0.0903068 mm^2
+        Peak Dynamic = 0.0451514 W
+        Subthreshold Leakage = 0.000345688 W
+        Gate Leakage = 0.00032022 W
+        Runtime Dynamic = 0.0731287 W
+
+          Int Front End RAT:
+            Area = 0.0543672 mm^2
+            Peak Dynamic = 0.0237617 W
+            Subthreshold Leakage = 0.000175223 W
+            Gate Leakage = 0.000121525 W
+            Runtime Dynamic = 0.0475234 W
+
+          FP Front End RAT:
+            Area = 0.0185325 mm^2
+            Peak Dynamic = 0.00949419 W
+            Subthreshold Leakage = 0.000100325 W
+            Gate Leakage = 6.76251e-05 W
+            Runtime Dynamic = 0.00949419 W
+
+          Free List:
+            Area = 0.00599955 mm^2
+            Peak Dynamic = 0.00225065 W
+            Subthreshold Leakage = 1.24363e-05 W
+            Gate Leakage = 1.00844e-05 W
+            Runtime Dynamic = 0.0090026 W
+
+          Int Retire RAT: 
+            Area = 0.00605969 mm^2
+            Peak Dynamic = 0.00448392 W
+            Subthreshold Leakage = 1.33231e-05 W
+            Gate Leakage = 1.16235e-05 W
+            Runtime Dynamic = 0.00448392 W
+
+          FP Retire RAT:
+            Area = 0.000650815 mm^2
+            Peak Dynamic = 0.00067334 W
+            Subthreshold Leakage = 5.7769e-06 W
+            Gate Leakage = 4.75075e-06 W
+            Runtime Dynamic = 0.00067334 W
+
+          FP Free List:
+            Area = 0.00305098 mm^2
+            Peak Dynamic = 0.00195124 W
+            Subthreshold Leakage = 8.81712e-06 W
+            Gate Leakage = 6.96054e-06 W
+            Runtime Dynamic = 0.00195124 W
+
+      Load Store Unit:
+        Area = 0.274913 mm^2
+        Peak Dynamic = 0.0347482 W
+        Subthreshold Leakage = 0.0032012 W
+        Gate Leakage = 0.00235752 W
+        Runtime Dynamic = 0.195304 W
+
+          Data Cache:
+            Area = 0.240878 mm^2
+            Peak Dynamic = 0.0293665 W
+            Subthreshold Leakage = 0.00312878 W
+            Gate Leakage = 0.00220794 W
+            Runtime Dynamic = 0.19026 W
+
+          StoreQ:
+            Area = 0.00754674 mm^2
+            Peak Dynamic = 0.00358087 W
+            Subthreshold Leakage = 4.2633e-05 W
+            Gate Leakage = 5.19212e-05 W
+            Runtime Dynamic = 0.00504348 W
+
+      Memory Management Unit:
+        Area = 0.021508 mm^2
+        Peak Dynamic = 0.0127337 W
+        Subthreshold Leakage = 0.000210621 W
+        Gate Leakage = 0.000290666 W
+        Runtime Dynamic = 0.037071 W
+
+          Itlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00617846 W
+            Subthreshold Leakage = 9.04168e-05 W
+            Gate Leakage = 9.65082e-05 W
+            Runtime Dynamic = 0.012357 W
+
+          Dtlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00438671 W
+            Subthreshold Leakage = 9.04168e-05 W
+            Gate Leakage = 9.65082e-05 W
+            Runtime Dynamic = 0.0247139 W
+
+      Execution Unit:
+        Area = 1.65498 mm^2
+        Peak Dynamic = 0.261758 W
+        Subthreshold Leakage = 0.0305522 W
+        Gate Leakage = 0.0360036 W
+        Runtime Dynamic = 0.386669 W
+
+          Register Files:
+            Area = 0.203203 mm^2
+            Peak Dynamic = 0.0763282 W
+            Subthreshold Leakage = 0.000197046 W
+            Gate Leakage = 0.00016338 W
+            Runtime Dynamic = 0.0386066 W
+
+              Integer RF:
+                Area = 0.146073 mm^2
+                Peak Dynamic = 0.0763282 W
+                Subthreshold Leakage = 0.000120303 W
+                Gate Leakage = 9.97867e-05 W
+                Runtime Dynamic = 0.0345689 W
+
+              Floating Point RF:
+                Area = 0.05713 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 7.67427e-05 W
+                Gate Leakage = 6.35938e-05 W
+                Runtime Dynamic = 0.00403765 W
+
+          Instruction Scheduler:
+            Area = 0.0582889 mm^2
+            Peak Dynamic = 0.0522571 W
+            Subthreshold Leakage = 0.000128698 W
+            Gate Leakage = 0.000185714 W
+            Runtime Dynamic = 0.0787473 W
+
+              Instruction Window:
+                Area = 0.053925 mm^2
+                Peak Dynamic = 0.0445895 W
+                Subthreshold Leakage = 9.52936e-05 W
+                Gate Leakage = 0.000130718 W
+                Runtime Dynamic = 0.0602231 W
+
+              FP Instruction Window:
+                Area = 0.00436388 mm^2
+                Peak Dynamic = 0.00766759 W
+                Subthreshold Leakage = 3.34043e-05 W
+                Gate Leakage = 5.49962e-05 W
+                Runtime Dynamic = 0.0185242 W
+
+          Integer ALUs (Count: 3 ):
+            Area = 0.312404 mm^2
+            Peak Dynamic = 0.0283684 W
+            Subthreshold Leakage = 0.0140724 W
+            Gate Leakage = 0.0165703 W
+            Runtime Dynamic = 0.0373268 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 0.971259 mm^2
+            Peak Dynamic = 0 W
+            Subthreshold Leakage = 0.0109377 W
+            Gate Leakage = 0.0128792 W
+            Runtime Dynamic = 0.0373268 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.104135 mm^2
+            Peak Dynamic = 0.0204053 W
+            Subthreshold Leakage = 0.00469079 W
+            Gate Leakage = 0.00552345 W
+            Runtime Dynamic = 0.049769 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00404385 mm^2
+            Peak Dynamic = 0.0824719 W
+            Subthreshold Leakage = 0.000495836 W
+            Gate Leakage = 0.000583852 W
+            Runtime Dynamic = 0.144892 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 0.244856 mm^2
+      Peak Dynamic = 0.0224876 W
+      Subthreshold Leakage = 0.00198854 W
+      Gate Leakage = 0.00327816 W
+      Runtime Dynamic = 1.80289 W
+
+*****************************************************************************************
+BUSES
+      Area = 0.0148119 mm^2
+      Peak Dynamic = 0.0170648 W
+      Subthreshold Leakage = 0.000218992 W
+      Gate Leakage = 0.000310207 W
+      Runtime Dynamic = 0.0853239 W
+
+      Bus: 
+        Area = 0.0148119 mm^2
+        Peak Dynamic = 0.0170648 W
+        Subthreshold Leakage = 0.000218992 W
+        Gate Leakage = 0.000310207 W
+        Runtime Dynamic = 0.0853239 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/A9_2000_withIOC b/ext/mcpat/results/A9_2000_withIOC
new file mode 100644 (file)
index 0000000..b475093
--- /dev/null
@@ -0,0 +1,410 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
+SerDer_dyn 0.00216115
+ctrl_dyn 0.0278216
+ctrl_dyn 6.14856e-11
+SerDer_dyn 1.54368e-11
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 40 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= conservative interconnect technology projection
+  Core clock Rate(MHz) 2000
+
+*****************************************************************************************
+Processor: 
+  Area = 7.05775 mm^2
+  Peak Power = 2.06734 W
+  Total Leakage = 0.204814 W
+  Peak Dynamic = 1.86253 W
+  Subthreshold Leakage = 0.0916805 W
+  Gate Leakage = 0.113134 W
+  Runtime Dynamic = 5.3744 W
+
+  Total Cores: 2 cores 
+  Device Type= ITRS low operating power device type
+    Area = 5.33485 mm^2
+    Peak Dynamic = 1.07823 W
+    Subthreshold Leakage = 0.0827641 W
+    Gate Leakage = 0.0887315 W
+    Runtime Dynamic = 0.975395 W
+
+  Total First Level Directory: 
+  Device Type= ITRS low operating power device type
+    Area = 0.489711 mm^2
+    Peak Dynamic = 0.0449752 W
+    Subthreshold Leakage = 0.00397708 W
+    Gate Leakage = 0.00655632 W
+    Runtime Dynamic = 1.80289 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS low operating power device type
+    Area = 0.0162858 mm^2
+    Peak Dynamic = 0.0187629 W
+    Subthreshold Leakage = 0.000240784 W
+    Gate Leakage = 0.000341076 W
+    Runtime Dynamic = 0.0938146 W
+
+  Total MCs: 1 Memory Controllers 
+  Device Type= ITRS low operating power device type
+    Area = 0.554183 mm^2
+    Peak Dynamic = 0.31033 W
+    Subthreshold Leakage = 0.0020922 W
+    Gate Leakage = 0.00751531 W
+    Runtime Dynamic = 2.21514 W
+
+  Total Flash/SSD Controllers: 1 Flash/SSD Controllers 
+  Device Type= ITRS low operating power device type
+    Area = 0.109065 mm^2
+    Peak Dynamic = 0.0299827 W
+    Subthreshold Leakage = 0.000522213 W
+    Gate Leakage = 0.0020015 W
+    Runtime Dynamic = 0.0209879 W
+
+  Total NIUs: 1 Network Interface Units 
+  Device Type= ITRS low operating power device type
+    Area = 0.261302 mm^2
+    Peak Dynamic = 0.164859 W
+    Subthreshold Leakage = 0.000730171 W
+    Gate Leakage = 0.00279855 W
+    Runtime Dynamic = 0.115402 W
+
+  Total PCIes: 1 PCIe Controllers 
+  Device Type= ITRS low operating power device type
+    Area = 0.292355 mm^2
+    Peak Dynamic = 0.215383 W
+    Subthreshold Leakage = 0.00135405 W
+    Gate Leakage = 0.00518971 W
+    Runtime Dynamic = 0.150768 W
+
+*****************************************************************************************
+Core:
+      Area = 2.66742 mm^2
+      Peak Dynamic = 0.539116 W
+      Subthreshold Leakage = 0.041382 W
+      Gate Leakage = 0.0443657 W
+      Runtime Dynamic = 0.975395 W
+
+      Instruction Fetch Unit:
+        Area = 0.565848 mm^2
+        Peak Dynamic = 0.184724 W
+        Subthreshold Leakage = 0.00572394 W
+        Gate Leakage = 0.00380598 W
+        Runtime Dynamic = 0.283222 W
+
+          Instruction Cache:
+            Area = 0.235613 mm^2
+            Peak Dynamic = 0.0310428 W
+            Subthreshold Leakage = 0.00309635 W
+            Gate Leakage = 0.00216385 W
+            Runtime Dynamic = 0.0461626 W
+
+          Branch Target Buffer:
+            Area = 0.251259 mm^2
+            Peak Dynamic = 0.0174433 W
+            Subthreshold Leakage = 0.00170231 W
+            Gate Leakage = 0.000908123 W
+            Runtime Dynamic = 0.0697733 W
+
+          Branch Predictor:
+            Area = 0.064441 mm^2
+            Peak Dynamic = 0.00815792 W
+            Subthreshold Leakage = 0.00070444 W
+            Gate Leakage = 0.000477387 W
+            Runtime Dynamic = 0.0113878 W
+
+              Global Predictor:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00374527 W
+                Subthreshold Leakage = 0.00034631 W
+                Gate Leakage = 0.000233555 W
+                Runtime Dynamic = 0.00545806 W
+
+              Local Predictor:
+                Area = 0.000711939 mm^2
+                Peak Dynamic = 0.000301014 W
+                Subthreshold Leakage = 6.13457e-06 W
+                Gate Leakage = 5.63471e-06 W
+                Runtime Dynamic = 0.000471566 W
+
+                Area = 0.000650815 mm^2
+                Peak Dynamic = 0.000230123 W
+                Subthreshold Leakage = 5.7769e-06 W
+                Gate Leakage = 4.75075e-06 W
+                Runtime Dynamic = 0.000354988 W
+
+              Chooser:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00374527 W
+                Subthreshold Leakage = 0.00034631 W
+                Gate Leakage = 0.000233555 W
+                Runtime Dynamic = 0.00545806 W
+
+              RAS:
+                Area = 0.000996272 mm^2
+                Peak Dynamic = 0.000366372 W
+                Subthreshold Leakage = 5.68653e-06 W
+                Gate Leakage = 4.64147e-06 W
+                Runtime Dynamic = 6.23994e-08 W
+
+          Instruction Buffer:
+            Area = 0.00820192 mm^2
+            Peak Dynamic = 0.0669878 W
+            Subthreshold Leakage = 6.33536e-05 W
+            Gate Leakage = 4.34841e-05 W
+            Runtime Dynamic = 0.0382787 W
+
+          Instruction Decoder:
+            Area = 0.00468731 mm^2
+            Peak Dynamic = 0.05881 W
+            Subthreshold Leakage = 0.000127696 W
+            Gate Leakage = 0.000115494 W
+            Runtime Dynamic = 0.11762 W
+
+      Renaming Unit:
+        Area = 0.0903068 mm^2
+        Peak Dynamic = 0.0451514 W
+        Subthreshold Leakage = 0.000345688 W
+        Gate Leakage = 0.00032022 W
+        Runtime Dynamic = 0.0731287 W
+
+          Int Front End RAT:
+            Area = 0.0543672 mm^2
+            Peak Dynamic = 0.0237617 W
+            Subthreshold Leakage = 0.000175223 W
+            Gate Leakage = 0.000121525 W
+            Runtime Dynamic = 0.0475234 W
+
+          FP Front End RAT:
+            Area = 0.0185325 mm^2
+            Peak Dynamic = 0.00949419 W
+            Subthreshold Leakage = 0.000100325 W
+            Gate Leakage = 6.76251e-05 W
+            Runtime Dynamic = 0.00949419 W
+
+          Free List:
+            Area = 0.00599955 mm^2
+            Peak Dynamic = 0.00225065 W
+            Subthreshold Leakage = 1.24363e-05 W
+            Gate Leakage = 1.00844e-05 W
+            Runtime Dynamic = 0.0090026 W
+
+          Int Retire RAT: 
+            Area = 0.00605969 mm^2
+            Peak Dynamic = 0.00448392 W
+            Subthreshold Leakage = 1.33231e-05 W
+            Gate Leakage = 1.16235e-05 W
+            Runtime Dynamic = 0.00448392 W
+
+          FP Retire RAT:
+            Area = 0.000650815 mm^2
+            Peak Dynamic = 0.00067334 W
+            Subthreshold Leakage = 5.7769e-06 W
+            Gate Leakage = 4.75075e-06 W
+            Runtime Dynamic = 0.00067334 W
+
+          FP Free List:
+            Area = 0.00305098 mm^2
+            Peak Dynamic = 0.00195124 W
+            Subthreshold Leakage = 8.81712e-06 W
+            Gate Leakage = 6.96054e-06 W
+            Runtime Dynamic = 0.00195124 W
+
+      Load Store Unit:
+        Area = 0.274913 mm^2
+        Peak Dynamic = 0.0347482 W
+        Subthreshold Leakage = 0.0032012 W
+        Gate Leakage = 0.00235752 W
+        Runtime Dynamic = 0.195304 W
+
+          Data Cache:
+            Area = 0.240878 mm^2
+            Peak Dynamic = 0.0293665 W
+            Subthreshold Leakage = 0.00312878 W
+            Gate Leakage = 0.00220794 W
+            Runtime Dynamic = 0.19026 W
+
+          StoreQ:
+            Area = 0.00754674 mm^2
+            Peak Dynamic = 0.00358087 W
+            Subthreshold Leakage = 4.2633e-05 W
+            Gate Leakage = 5.19212e-05 W
+            Runtime Dynamic = 0.00504348 W
+
+      Memory Management Unit:
+        Area = 0.021508 mm^2
+        Peak Dynamic = 0.0127337 W
+        Subthreshold Leakage = 0.000210621 W
+        Gate Leakage = 0.000290666 W
+        Runtime Dynamic = 0.037071 W
+
+          Itlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00617846 W
+            Subthreshold Leakage = 9.04168e-05 W
+            Gate Leakage = 9.65082e-05 W
+            Runtime Dynamic = 0.012357 W
+
+          Dtlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00438671 W
+            Subthreshold Leakage = 9.04168e-05 W
+            Gate Leakage = 9.65082e-05 W
+            Runtime Dynamic = 0.0247139 W
+
+      Execution Unit:
+        Area = 1.65498 mm^2
+        Peak Dynamic = 0.261758 W
+        Subthreshold Leakage = 0.0305522 W
+        Gate Leakage = 0.0360036 W
+        Runtime Dynamic = 0.386669 W
+
+          Register Files:
+            Area = 0.203203 mm^2
+            Peak Dynamic = 0.0763282 W
+            Subthreshold Leakage = 0.000197046 W
+            Gate Leakage = 0.00016338 W
+            Runtime Dynamic = 0.0386066 W
+
+              Integer RF:
+                Area = 0.146073 mm^2
+                Peak Dynamic = 0.0763282 W
+                Subthreshold Leakage = 0.000120303 W
+                Gate Leakage = 9.97867e-05 W
+                Runtime Dynamic = 0.0345689 W
+
+              Floating Point RF:
+                Area = 0.05713 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 7.67427e-05 W
+                Gate Leakage = 6.35938e-05 W
+                Runtime Dynamic = 0.00403765 W
+
+          Instruction Scheduler:
+            Area = 0.0582889 mm^2
+            Peak Dynamic = 0.0522571 W
+            Subthreshold Leakage = 0.000128698 W
+            Gate Leakage = 0.000185714 W
+            Runtime Dynamic = 0.0787473 W
+
+              Instruction Window:
+                Area = 0.053925 mm^2
+                Peak Dynamic = 0.0445895 W
+                Subthreshold Leakage = 9.52936e-05 W
+                Gate Leakage = 0.000130718 W
+                Runtime Dynamic = 0.0602231 W
+
+              FP Instruction Window:
+                Area = 0.00436388 mm^2
+                Peak Dynamic = 0.00766759 W
+                Subthreshold Leakage = 3.34043e-05 W
+                Gate Leakage = 5.49962e-05 W
+                Runtime Dynamic = 0.0185242 W
+
+          Integer ALUs (Count: 3 ):
+            Area = 0.312404 mm^2
+            Peak Dynamic = 0.0283684 W
+            Subthreshold Leakage = 0.0140724 W
+            Gate Leakage = 0.0165703 W
+            Runtime Dynamic = 0.0373268 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 0.971259 mm^2
+            Peak Dynamic = 0 W
+            Subthreshold Leakage = 0.0109377 W
+            Gate Leakage = 0.0128792 W
+            Runtime Dynamic = 0.0373268 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.104135 mm^2
+            Peak Dynamic = 0.0204053 W
+            Subthreshold Leakage = 0.00469079 W
+            Gate Leakage = 0.00552345 W
+            Runtime Dynamic = 0.049769 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00404385 mm^2
+            Peak Dynamic = 0.0824719 W
+            Subthreshold Leakage = 0.000495836 W
+            Gate Leakage = 0.000583852 W
+            Runtime Dynamic = 0.144892 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 0.244856 mm^2
+      Peak Dynamic = 0.0224876 W
+      Subthreshold Leakage = 0.00198854 W
+      Gate Leakage = 0.00327816 W
+      Runtime Dynamic = 1.80289 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 0.554183 mm^2
+      Peak Dynamic = 0.31033 W
+      Subthreshold Leakage = 0.0020922 W
+      Gate Leakage = 0.00751531 W
+      Runtime Dynamic = 2.21514 W
+
+      Front End Engine:
+        Area = 0.111447 mm^2
+        Peak Dynamic = 0.0117646 W
+        Subthreshold Leakage = 0.000188068 W
+        Gate Leakage = 0.000217277 W
+        Runtime Dynamic = 0.0796061 W
+
+      Transaction Engine:
+        Area = 0.113609 mm^2
+        Peak Dynamic = 0.160252 W
+        Subthreshold Leakage = 0.000380826 W
+        Gate Leakage = 0.00145961 W
+        Runtime Dynamic = 1.08436 W
+
+      PHY:
+        Area = 0.329127 mm^2
+        Peak Dynamic = 0.138314 W
+        Subthreshold Leakage = 0.00152331 W
+        Gate Leakage = 0.00583843 W
+        Runtime Dynamic = 1.05117 W
+
+*****************************************************************************************
+Flash Controller:
+      Area = 0.109065 mm^2
+      Peak Dynamic = 0.0299827 W
+      Subthreshold Leakage = 0.000522213 W
+      Gate Leakage = 0.0020015 W
+      Runtime Dynamic = 0.0209879 W
+
+*****************************************************************************************
+NIU:
+      Area = 0.261302 mm^2
+      Peak Dynamic = 0.164859 W
+      Subthreshold Leakage = 0.000730171 W
+      Gate Leakage = 0.00279855 W
+      Runtime Dynamic = 0.115402 W
+
+*****************************************************************************************
+PCIe:
+      Area = 0.292355 mm^2
+      Peak Dynamic = 0.215383 W
+      Subthreshold Leakage = 0.00135405 W
+      Gate Leakage = 0.00518971 W
+      Runtime Dynamic = 0.150768 W
+
+*****************************************************************************************
+BUSES
+      Area = 0.0162858 mm^2
+      Peak Dynamic = 0.0187629 W
+      Subthreshold Leakage = 0.000240784 W
+      Gate Leakage = 0.000341076 W
+      Runtime Dynamic = 0.0938146 W
+
+      Bus: 
+        Area = 0.0162858 mm^2
+        Peak Dynamic = 0.0187629 W
+        Subthreshold Leakage = 0.000240784 W
+        Gate Leakage = 0.000341076 W
+        Runtime Dynamic = 0.0938146 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/A9_800 b/ext/mcpat/results/A9_800
new file mode 100644 (file)
index 0000000..e8f3301
--- /dev/null
@@ -0,0 +1,320 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 40 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= conservative interconnect technology projection
+  Core clock Rate(MHz) 800
+
+*****************************************************************************************
+Processor: 
+  Area = 5.48929 mm^2
+  Peak Power = 0.577263 W
+  Total Leakage = 0.127046 W
+  Peak Dynamic = 0.450217 W
+  Subthreshold Leakage = 0.0608257 W
+  Gate Leakage = 0.0662198 W
+  Runtime Dynamic = 1.13304 W
+
+  Total Cores: 
+  Device Type= ITRS low operating power device type
+    Area = 4.98521 mm^2
+    Peak Dynamic = 0.425609 W
+    Subthreshold Leakage = 0.0577408 W
+    Gate Leakage = 0.061241 W
+    Runtime Dynamic = 0.37879 W
+
+  Total First Level Directory: 
+  Device Type= ITRS low operating power device type
+    Area = 0.489711 mm^2
+    Peak Dynamic = 0.0179901 W
+    Subthreshold Leakage = 0.0029286 W
+    Gate Leakage = 0.00476045 W
+    Runtime Dynamic = 0.721156 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS low operating power device type
+    Area = 0.0143604 mm^2
+    Peak Dynamic = 0.00661787 W
+    Subthreshold Leakage = 0.000156344 W
+    Gate Leakage = 0.000218372 W
+    Runtime Dynamic = 0.0330893 W
+
+*****************************************************************************************
+Core:
+      Area = 2.49261 mm^2
+      Peak Dynamic = 0.212805 W
+      Subthreshold Leakage = 0.0288704 W
+      Gate Leakage = 0.0306205 W
+      Runtime Dynamic = 0.37879 W
+
+      Instruction Fetch Unit:
+        Area = 0.450898 mm^2
+        Peak Dynamic = 0.0710479 W
+        Subthreshold Leakage = 0.00360576 W
+        Gate Leakage = 0.00232348 W
+        Runtime Dynamic = 0.101921 W
+
+          Instruction Cache:
+            Area = 0.235613 mm^2
+            Peak Dynamic = 0.0124171 W
+            Subthreshold Leakage = 0.00228006 W
+            Gate Leakage = 0.00157114 W
+            Runtime Dynamic = 0.018465 W
+
+          Branch Target Buffer:
+            Area = 0.136309 mm^2
+            Peak Dynamic = 0.00413545 W
+            Subthreshold Leakage = 0.000644359 W
+            Gate Leakage = 0.000219381 W
+            Runtime Dynamic = 0.0165418 W
+
+          Branch Predictor:
+            Area = 0.064441 mm^2
+            Peak Dynamic = 0.00326317 W
+            Subthreshold Leakage = 0.000518728 W
+            Gate Leakage = 0.000346624 W
+            Runtime Dynamic = 0.0045551 W
+
+              Global Predictor:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00149811 W
+                Subthreshold Leakage = 0.000255012 W
+                Gate Leakage = 0.000169581 W
+                Runtime Dynamic = 0.00218323 W
+
+              Local Predictor:
+                Area = 0.000711939 mm^2
+                Peak Dynamic = 0.000120406 W
+                Subthreshold Leakage = 4.51731e-06 W
+                Gate Leakage = 4.09128e-06 W
+                Runtime Dynamic = 0.000188626 W
+
+                Area = 0.000650815 mm^2
+                Peak Dynamic = 9.20494e-05 W
+                Subthreshold Leakage = 4.25393e-06 W
+                Gate Leakage = 3.44945e-06 W
+                Runtime Dynamic = 0.000141995 W
+
+              Chooser:
+                Area = 0.0313969 mm^2
+                Peak Dynamic = 0.00149811 W
+                Subthreshold Leakage = 0.000255012 W
+                Gate Leakage = 0.000169581 W
+                Runtime Dynamic = 0.00218323 W
+
+              RAS:
+                Area = 0.000996272 mm^2
+                Peak Dynamic = 0.000146549 W
+                Subthreshold Leakage = 4.18739e-06 W
+                Gate Leakage = 3.3701e-06 W
+                Runtime Dynamic = 2.49598e-08 W
+
+          Instruction Buffer:
+            Area = 0.00820192 mm^2
+            Peak Dynamic = 0.0267951 W
+            Subthreshold Leakage = 4.66516e-05 W
+            Gate Leakage = 3.15732e-05 W
+            Runtime Dynamic = 0.0153115 W
+
+          Instruction Decoder:
+            Area = 0.00468731 mm^2
+            Peak Dynamic = 0.023524 W
+            Subthreshold Leakage = 9.40317e-05 W
+            Gate Leakage = 8.38587e-05 W
+            Runtime Dynamic = 0.047048 W
+
+      Renaming Unit:
+        Area = 0.0903068 mm^2
+        Peak Dynamic = 0.0180606 W
+        Subthreshold Leakage = 0.000254554 W
+        Gate Leakage = 0.000232507 W
+        Runtime Dynamic = 0.0292515 W
+
+          Int Front End RAT:
+            Area = 0.0543672 mm^2
+            Peak Dynamic = 0.00950468 W
+            Subthreshold Leakage = 0.000129029 W
+            Gate Leakage = 8.82378e-05 W
+            Runtime Dynamic = 0.0190094 W
+
+          FP Front End RAT:
+            Area = 0.0185325 mm^2
+            Peak Dynamic = 0.00379768 W
+            Subthreshold Leakage = 7.38761e-05 W
+            Gate Leakage = 4.91016e-05 W
+            Runtime Dynamic = 0.00379768 W
+
+          Free List:
+            Area = 0.00599955 mm^2
+            Peak Dynamic = 0.00090026 W
+            Subthreshold Leakage = 9.15772e-06 W
+            Gate Leakage = 7.32213e-06 W
+            Runtime Dynamic = 0.00360104 W
+
+          Int Retire RAT: 
+            Area = 0.00605969 mm^2
+            Peak Dynamic = 0.00179357 W
+            Subthreshold Leakage = 9.8107e-06 W
+            Gate Leakage = 8.43969e-06 W
+            Runtime Dynamic = 0.00179357 W
+
+          FP Retire RAT:
+            Area = 0.000650815 mm^2
+            Peak Dynamic = 0.000269336 W
+            Subthreshold Leakage = 4.25393e-06 W
+            Gate Leakage = 3.44945e-06 W
+            Runtime Dynamic = 0.000269336 W
+
+          FP Free List:
+            Area = 0.00305098 mm^2
+            Peak Dynamic = 0.000780497 W
+            Subthreshold Leakage = 6.49266e-06 W
+            Gate Leakage = 5.05395e-06 W
+            Runtime Dynamic = 0.000780497 W
+
+      Load Store Unit:
+        Area = 0.274913 mm^2
+        Peak Dynamic = 0.0138993 W
+        Subthreshold Leakage = 0.00235727 W
+        Gate Leakage = 0.00171176 W
+        Runtime Dynamic = 0.0781216 W
+
+          Data Cache:
+            Area = 0.240878 mm^2
+            Peak Dynamic = 0.0117466 W
+            Subthreshold Leakage = 0.00230394 W
+            Gate Leakage = 0.00160316 W
+            Runtime Dynamic = 0.0761042 W
+
+          StoreQ:
+            Area = 0.00754674 mm^2
+            Peak Dynamic = 0.00143235 W
+            Subthreshold Leakage = 3.13936e-05 W
+            Gate Leakage = 3.76992e-05 W
+            Runtime Dynamic = 0.00201739 W
+
+      Memory Management Unit:
+        Area = 0.021508 mm^2
+        Peak Dynamic = 0.0050935 W
+        Subthreshold Leakage = 0.000155095 W
+        Gate Leakage = 0.000211049 W
+        Runtime Dynamic = 0.0148284 W
+
+          Itlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00247139 W
+            Subthreshold Leakage = 6.65801e-05 W
+            Gate Leakage = 7.00732e-05 W
+            Runtime Dynamic = 0.0049428 W
+
+          Dtlb:
+            Area = 0.00993091 mm^2
+            Peak Dynamic = 0.00175468 W
+            Subthreshold Leakage = 6.65801e-05 W
+            Gate Leakage = 7.00732e-05 W
+            Runtime Dynamic = 0.00988557 W
+
+      Execution Unit:
+        Area = 1.65498 mm^2
+        Peak Dynamic = 0.104703 W
+        Subthreshold Leakage = 0.0224977 W
+        Gate Leakage = 0.0261417 W
+        Runtime Dynamic = 0.154667 W
+
+          Register Files:
+            Area = 0.203203 mm^2
+            Peak Dynamic = 0.0305313 W
+            Subthreshold Leakage = 0.000145099 W
+            Gate Leakage = 0.000118628 W
+            Runtime Dynamic = 0.0154426 W
+
+              Integer RF:
+                Area = 0.146073 mm^2
+                Peak Dynamic = 0.0305313 W
+                Subthreshold Leakage = 8.85877e-05 W
+                Gate Leakage = 7.24537e-05 W
+                Runtime Dynamic = 0.0138276 W
+
+              Floating Point RF:
+                Area = 0.05713 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 5.6511e-05 W
+                Gate Leakage = 4.61745e-05 W
+                Runtime Dynamic = 0.00161506 W
+
+          Instruction Scheduler:
+            Area = 0.0582889 mm^2
+            Peak Dynamic = 0.0209028 W
+            Subthreshold Leakage = 9.47693e-05 W
+            Gate Leakage = 0.000134844 W
+            Runtime Dynamic = 0.0314989 W
+
+              Instruction Window:
+                Area = 0.053925 mm^2
+                Peak Dynamic = 0.0178358 W
+                Subthreshold Leakage = 7.01713e-05 W
+                Gate Leakage = 9.49122e-05 W
+                Runtime Dynamic = 0.0240893 W
+
+              FP Instruction Window:
+                Area = 0.00436388 mm^2
+                Peak Dynamic = 0.00306704 W
+                Subthreshold Leakage = 2.45979e-05 W
+                Gate Leakage = 3.99319e-05 W
+                Runtime Dynamic = 0.00740966 W
+
+          Integer ALUs (Count: 3 ):
+            Area = 0.312404 mm^2
+            Peak Dynamic = 0.0113473 W
+            Subthreshold Leakage = 0.0103625 W
+            Gate Leakage = 0.0120315 W
+            Runtime Dynamic = 0.0149307 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 0.971259 mm^2
+            Peak Dynamic = 0 W
+            Subthreshold Leakage = 0.00805417 W
+            Gate Leakage = 0.00935142 W
+            Runtime Dynamic = 0.0149307 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.104135 mm^2
+            Peak Dynamic = 0.00816212 W
+            Subthreshold Leakage = 0.00345415 W
+            Gate Leakage = 0.0040105 W
+            Runtime Dynamic = 0.0199076 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00404385 mm^2
+            Peak Dynamic = 0.0329888 W
+            Subthreshold Leakage = 0.000365119 W
+            Gate Leakage = 0.000423926 W
+            Runtime Dynamic = 0.0579569 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 0.244856 mm^2
+      Peak Dynamic = 0.00899504 W
+      Subthreshold Leakage = 0.0014643 W
+      Gate Leakage = 0.00238022 W
+      Runtime Dynamic = 0.721156 W
+
+*****************************************************************************************
+BUSES
+      Area = 0.0143604 mm^2
+      Peak Dynamic = 0.00661787 W
+      Subthreshold Leakage = 0.000156344 W
+      Gate Leakage = 0.000218372 W
+      Runtime Dynamic = 0.0330893 W
+
+      Bus: 
+        Area = 0.0143604 mm^2
+        Peak Dynamic = 0.00661787 W
+        Subthreshold Leakage = 0.000156344 W
+        Gate Leakage = 0.000218372 W
+        Runtime Dynamic = 0.0330893 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/Alpha21364 b/ext/mcpat/results/Alpha21364
new file mode 100644 (file)
index 0000000..1b3d9e4
--- /dev/null
@@ -0,0 +1,441 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+Warning: icache array structure cannot satisfy throughput constraint.
+Warning: icache array structure cannot satisfy latency constraint.
+Warning: InstBuffer array structure cannot satisfy throughput constraint.
+Warning: InstBuffer array structure cannot satisfy latency constraint.
+Warning: Branch Target Buffer array structure cannot satisfy throughput constraint.
+Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
+Warning: Global Predictor array structure cannot satisfy throughput constraint.
+Warning: Global Predictor array structure cannot satisfy latency constraint.
+Warning: L1 local Predictor array structure cannot satisfy throughput constraint.
+Warning: L1 local Predictor array structure cannot satisfy latency constraint.
+Warning: L2 local Predictor array structure cannot satisfy throughput constraint.
+Warning: L2 local Predictor array structure cannot satisfy latency constraint.
+Warning: Predictor Chooser array structure cannot satisfy throughput constraint.
+Warning: Predictor Chooser array structure cannot satisfy latency constraint.
+Warning: RAS array structure cannot satisfy throughput constraint.
+Warning: RAS array structure cannot satisfy latency constraint.
+Warning: dcache array structure cannot satisfy throughput constraint.
+Warning: dcache array structure cannot satisfy latency constraint.
+Warning: Integer Register File array structure cannot satisfy throughput constraint.
+Warning: Integer Register File array structure cannot satisfy latency constraint.
+Warning: Floating point Register File array structure cannot satisfy throughput constraint.
+Warning: Floating point Register File array structure cannot satisfy latency constraint.
+Warning: ReorderBuffer array structure cannot satisfy throughput constraint.
+Warning: ReorderBuffer array structure cannot satisfy latency constraint.
+Warning: Int RetireRAT array structure cannot satisfy throughput constraint.
+Warning: Int RetireRAT array structure cannot satisfy latency constraint.
+Warning: Int RetireRAT array structure cannot satisfy latency constraint.
+Warning: Int Free List array structure cannot satisfy throughput constraint.
+Warning: Int Free List array structure cannot satisfy latency constraint.
+Warning: Int Free List array structure cannot satisfy throughput constraint.
+Warning: Int Free List array structure cannot satisfy latency constraint.
+Warning: MC ReadBuffer array structure cannot satisfy throughput constraint.
+Warning: MC ReadBuffer array structure cannot satisfy latency constraint.
+Warning: MC writeBuffer array structure cannot satisfy throughput constraint.
+Warning: MC writeBuffer array structure cannot satisfy latency constraint.
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 180 nm
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 323.859 mm^2
+  Peak Power = 90.0375 W
+  Total Leakage = 0.156795 W
+  Peak Dynamic = 89.8807 W
+  Subthreshold Leakage = 0.151936 W
+  Gate Leakage = 0.00485969 W
+  Runtime Dynamic = 85.2036 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 137.839 mm^2
+    Peak Dynamic = 60.6776 W
+    Subthreshold Leakage = 0.067186 W
+    Gate Leakage = 0.00428355 W
+    Runtime Dynamic = 73.9555 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 137.063 mm^2
+    Peak Dynamic = 3.55835 W
+    Subthreshold Leakage = 0.0778886 W
+    Gate Leakage = 0.00016078 W
+    Runtime Dynamic = 6.34872 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 1.59954 mm^2
+    Peak Dynamic = 0.805902 W
+    Subthreshold Leakage = 0.000311783 W
+    Gate Leakage = 2.63568e-05 W
+    Runtime Dynamic = 0.547665 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 29.1057 mm^2
+    Peak Dynamic = 16.5188 W
+    Subthreshold Leakage = 0.00292556 W
+    Gate Leakage = 0.000166293 W
+    Runtime Dynamic = 2.54446 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 18.2519 mm^2
+    Peak Dynamic = 8.32001 W
+    Subthreshold Leakage = 0.00362353 W
+    Gate Leakage = 0.000222708 W
+    Runtime Dynamic = 1.80731 W
+
+*****************************************************************************************
+Core:
+      Area = 137.839 mm^2
+      Peak Dynamic = 60.6776 W
+      Subthreshold Leakage = 0.067186 W
+      Gate Leakage = 0.00428355 W
+      Runtime Dynamic = 73.9555 W
+
+      Instruction Fetch Unit:
+        Area = 27.6096 mm^2
+        Peak Dynamic = 9.86655 W
+        Subthreshold Leakage = 0.00622106 W
+        Gate Leakage = 0.000344671 W
+        Runtime Dynamic = 10.0567 W
+
+          Instruction Cache:
+            Area = 11.4511 mm^2
+            Peak Dynamic = 1.53259 W
+            Subthreshold Leakage = 0.00371341 W
+            Gate Leakage = 0.000171069 W
+            Runtime Dynamic = 2.13168 W
+
+          Branch Target Buffer:
+            Area = 13.3377 mm^2
+            Peak Dynamic = 0.56236 W
+            Subthreshold Leakage = 0.001581 W
+            Gate Leakage = 9.5198e-05 W
+            Runtime Dynamic = 2.24944 W
+
+          Branch Predictor:
+            Area = 2.1618 mm^2
+            Peak Dynamic = 0.234643 W
+            Subthreshold Leakage = 0.000469396 W
+            Gate Leakage = 2.01907e-05 W
+            Runtime Dynamic = 0.198646 W
+
+              Global Predictor:
+                Area = 0.893575 mm^2
+                Peak Dynamic = 0.0726984 W
+                Subthreshold Leakage = 0.000182866 W
+                Gate Leakage = 7.91951e-06 W
+                Runtime Dynamic = 0.0726984 W
+
+              Local Predictor:
+                Area = 0.420241 mm^2
+                Peak Dynamic = 0.0532456 W
+                Subthreshold Leakage = 9.20027e-05 W
+                Gate Leakage = 3.89162e-06 W
+                Runtime Dynamic = 0.0532456 W
+
+                Area = 0.291886 mm^2
+                Peak Dynamic = 0.0292091 W
+                Subthreshold Leakage = 5.262e-05 W
+                Gate Leakage = 2.51093e-06 W
+                Runtime Dynamic = 0.0292091 W
+
+              Chooser:
+                Area = 0.893575 mm^2
+                Peak Dynamic = 0.0726984 W
+                Subthreshold Leakage = 0.000182866 W
+                Gate Leakage = 7.91951e-06 W
+                Runtime Dynamic = 0.0726984 W
+
+              RAS:
+                Area = 0.0827607 mm^2
+                Peak Dynamic = 0.0360009 W
+                Subthreshold Leakage = 1.16623e-05 W
+                Gate Leakage = 4.60036e-07 W
+                Runtime Dynamic = 3.58028e-06 W
+
+          Instruction Buffer:
+            Area = 0.465385 mm^2
+            Peak Dynamic = 2.10455 W
+            Subthreshold Leakage = 6.13248e-05 W
+            Gate Leakage = 4.88113e-06 W
+            Runtime Dynamic = 1.40303 W
+
+          Instruction Decoder:
+            Area = 0.146031 mm^2
+            Peak Dynamic = 4.07384 W
+            Subthreshold Leakage = 7.07416e-05 W
+            Gate Leakage = 3.32268e-06 W
+            Runtime Dynamic = 4.07384 W
+
+      Renaming Unit:
+        Area = 11.7262 mm^2
+        Peak Dynamic = 12.5584 W
+        Subthreshold Leakage = 0.000886804 W
+        Gate Leakage = 9.92419e-05 W
+        Runtime Dynamic = 9.90647 W
+
+          Int Front End RAT:
+            Area = 8.24345 mm^2
+            Peak Dynamic = 8.04227 W
+            Subthreshold Leakage = 0.000376247 W
+            Gate Leakage = 3.40623e-05 W
+            Runtime Dynamic = 8.04227 W
+
+          FP Front End RAT:
+            Area = 2.549 mm^2
+            Peak Dynamic = 2.75082 W
+            Subthreshold Leakage = 0.000149367 W
+            Gate Leakage = 1.30084e-05 W
+            Runtime Dynamic = 1.37541 W
+
+          Free List:
+            Area = 0.446019 mm^2
+            Peak Dynamic = 0.156051 W
+            Subthreshold Leakage = 1.32133e-05 W
+            Gate Leakage = 7.4667e-07 W
+            Runtime Dynamic = 0.312102 W
+
+          Int Retire RAT: 
+            Area = 0.184445 mm^2
+            Peak Dynamic = 0.102656 W
+            Subthreshold Leakage = 8.50239e-06 W
+            Gate Leakage = 5.28869e-07 W
+            Runtime Dynamic = 0.102656 W
+
+          FP Retire RAT:
+            Area = 0.0567228 mm^2
+            Peak Dynamic = 0.0367258 W
+            Subthreshold Leakage = 5.67894e-06 W
+            Gate Leakage = 3.75578e-07 W
+            Runtime Dynamic = 0.0183629 W
+
+          FP Free List:
+            Area = 0.198929 mm^2
+            Peak Dynamic = 0.111293 W
+            Subthreshold Leakage = 8.61952e-06 W
+            Gate Leakage = 5.10875e-07 W
+            Runtime Dynamic = 0.0556467 W
+
+      Load Store Unit:
+        Area = 49.742 mm^2
+        Peak Dynamic = 11.7952 W
+        Subthreshold Leakage = 0.00715349 W
+        Gate Leakage = 0.00052778 W
+        Runtime Dynamic = 31.7658 W
+
+          Data Cache:
+            Area = 36.106 mm^2
+            Peak Dynamic = 9.28008 W
+            Subthreshold Leakage = 0.00663485 W
+            Gate Leakage = 0.000466572 W
+            Runtime Dynamic = 31.332 W
+
+          LoadQ:
+            Area = 2.60005 mm^2
+            Peak Dynamic = 0.578279 W
+            Subthreshold Leakage = 9.67302e-05 W
+            Gate Leakage = 5.59905e-06 W
+            Runtime Dynamic = 0.14457 W
+
+          StoreQ:
+            Area = 2.60005 mm^2
+            Peak Dynamic = 0.578279 W
+            Subthreshold Leakage = 9.67302e-05 W
+            Gate Leakage = 5.59905e-06 W
+            Runtime Dynamic = 0.289139 W
+
+      Memory Management Unit:
+        Area = 8.74543 mm^2
+        Peak Dynamic = 3.77198 W
+        Subthreshold Leakage = 0.00119904 W
+        Gate Leakage = 0.000127183 W
+        Runtime Dynamic = 4.82688 W
+
+          Itlb:
+            Area = 1.97969 mm^2
+            Peak Dynamic = 0.537563 W
+            Subthreshold Leakage = 0.000270576 W
+            Gate Leakage = 2.0845e-05 W
+            Runtime Dynamic = 1.07513 W
+
+          Dtlb:
+            Area = 6.71814 mm^2
+            Peak Dynamic = 1.87586 W
+            Subthreshold Leakage = 0.00060329 W
+            Gate Leakage = 5.63286e-05 W
+            Runtime Dynamic = 3.75174 W
+
+      Execution Unit:
+        Area = 31.4918 mm^2
+        Peak Dynamic = 22.6855 W
+        Subthreshold Leakage = 0.0320294 W
+        Gate Leakage = 0.00198102 W
+        Runtime Dynamic = 17.3997 W
+
+          Register Files:
+            Area = 9.9318 mm^2
+            Peak Dynamic = 3.92301 W
+            Subthreshold Leakage = 0.000295352 W
+            Gate Leakage = 1.33517e-05 W
+            Runtime Dynamic = 1.7929 W
+
+              Integer RF:
+                Area = 6.76678 mm^2
+                Peak Dynamic = 2.35597 W
+                Subthreshold Leakage = 0.000185762 W
+                Gate Leakage = 8.51701e-06 W
+                Runtime Dynamic = 1.60634 W
+
+              Floating Point RF:
+                Area = 3.16503 mm^2
+                Peak Dynamic = 1.56704 W
+                Subthreshold Leakage = 0.00010959 W
+                Gate Leakage = 4.83467e-06 W
+                Runtime Dynamic = 0.186553 W
+
+          Instruction Scheduler:
+            Area = 5.20691 mm^2
+            Peak Dynamic = 2.77224 W
+            Subthreshold Leakage = 0.000202187 W
+            Gate Leakage = 1.05832e-05 W
+            Runtime Dynamic = 3.11355 W
+
+              Instruction Window:
+                Area = 1.23862 mm^2
+                Peak Dynamic = 0.985117 W
+                Subthreshold Leakage = 5.55506e-05 W
+                Gate Leakage = 3.78978e-06 W
+                Runtime Dynamic = 1.23906 W
+
+              FP Instruction Window:
+                Area = 0.481718 mm^2
+                Peak Dynamic = 0.438839 W
+                Subthreshold Leakage = 2.5962e-05 W
+                Gate Leakage = 2.00351e-06 W
+                Runtime Dynamic = 0.526208 W
+
+              ROB:
+                Area = 3.48657 mm^2
+                Peak Dynamic = 1.34828 W
+                Subthreshold Leakage = 0.000120674 W
+                Gate Leakage = 4.78991e-06 W
+                Runtime Dynamic = 1.34828 W
+
+          Integer ALUs (Count: 4 ):
+            Area = 3.4944 mm^2
+            Peak Dynamic = 4.23312 W
+            Subthreshold Leakage = 0.016149 W
+            Gate Leakage = 0.000986885 W
+            Runtime Dynamic = 3.21343 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 12.705 mm^2
+            Peak Dynamic = 3.52215 W
+            Subthreshold Leakage = 0.0146787 W
+            Gate Leakage = 0.000897034 W
+            Runtime Dynamic = 3.52215 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.106062 mm^2
+            Peak Dynamic = 6.87645 W
+            Subthreshold Leakage = 0.000378957 W
+            Gate Leakage = 2.31585e-05 W
+            Runtime Dynamic = 5.75766 W
+
+*****************************************************************************************
+L2
+      Area = 137.063 mm^2
+      Peak Dynamic = 3.55835 W
+      Subthreshold Leakage = 0.0778886 W
+      Gate Leakage = 0.00016078 W
+      Runtime Dynamic = 6.34872 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 1.59954 mm^2
+      Peak Dynamic = 0.805902 W
+      Subthreshold Leakage = 0.000311783 W
+      Gate Leakage = 2.63568e-05 W
+      Runtime Dynamic = 0.547665 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 9.12595 mm^2
+      Peak Dynamic = 4.16 W
+      Subthreshold Leakage = 0.00181177 W
+      Gate Leakage = 0.000111354 W
+      Runtime Dynamic = 1.80731 W
+
+      Front End Engine:
+        Area = 5.49326 mm^2
+        Peak Dynamic = 1.42883 W
+        Subthreshold Leakage = 0.000132955 W
+        Gate Leakage = 8.76015e-06 W
+        Runtime Dynamic = 0.348049 W
+
+      Transaction Engine:
+        Area = 1.50616 mm^2
+        Peak Dynamic = 1.93117 W
+        Subthreshold Leakage = 0.000696058 W
+        Gate Leakage = 4.25369e-05 W
+        Runtime Dynamic = 0.579332 W
+
+      PHY:
+        Area = 2.12653 mm^2
+        Peak Dynamic = 0.8 W
+        Subthreshold Leakage = 0.000982753 W
+        Gate Leakage = 6.00571e-05 W
+        Runtime Dynamic = 0.879928 W
+
+*****************************************************************************************
+NOC
+      Area = 29.1057 mm^2
+      Peak Dynamic = 16.5188 W
+      Subthreshold Leakage = 0.00292556 W
+      Gate Leakage = 0.000166293 W
+      Runtime Dynamic = 2.54446 W
+
+      Router: 
+        Area = 28.4197 mm^2
+        Peak Dynamic = 8.76431 W
+        Subthreshold Leakage = 0.00199965 W
+        Gate Leakage = 0.000109709 W
+        Runtime Dynamic = 1.25204 W
+
+            Virtual Channel Buffer:
+              Area = 17.0424 mm^2
+              Peak Dynamic = 7.30291 W
+              Subthreshold Leakage = 0.00119658 W
+              Gate Leakage = 4.15511e-05 W
+              Runtime Dynamic = 1.04327 W
+
+            Crossbar:
+              Area = 0.357655 mm^2
+              Peak Dynamic = 1.27997 W
+              Subthreshold Leakage = 0.000801415 W
+              Gate Leakage = 6.80527e-05 W
+              Runtime Dynamic = 0.182853 W
+
+            Arbiter:
+              Peak Dynamic = 0.18143 W
+              Subthreshold Leakage = 1.65956e-06 W
+              Gate Leakage = 1.05559e-07 W
+              Runtime Dynamic = 0.0259186 W
+
+      Per Router : 
+        Area = 0.685989 mm^2
+        Peak Dynamic = 7.75447 W
+        Subthreshold Leakage = 0.000925911 W
+        Gate Leakage = 5.65834e-05 W
+        Runtime Dynamic = 1.29241 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/Alpha21364_90nm b/ext/mcpat/results/Alpha21364_90nm
new file mode 100644 (file)
index 0000000..2a97d77
--- /dev/null
@@ -0,0 +1,408 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+Warning: icache array structure cannot satisfy latency constraint.
+Warning: dcache array structure cannot satisfy latency constraint.
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 90 nm
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 139.86 mm^2
+  Peak Power = 34.9936 W
+  Total Leakage = 4.16949 W
+  Peak Dynamic = 30.8241 W
+  Subthreshold Leakage = 3.86203 W
+  Gate Leakage = 0.307463 W
+  Runtime Dynamic = 34.0612 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 61.1957 mm^2
+    Peak Dynamic = 19.6269 W
+    Subthreshold Leakage = 2.04452 W
+    Gate Leakage = 0.277429 W
+    Runtime Dynamic = 29.5972 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 62.2653 mm^2
+    Peak Dynamic = 1.42987 W
+    Subthreshold Leakage = 1.65481 W
+    Gate Leakage = 0.00860545 W
+    Runtime Dynamic = 2.73329 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 0.533824 mm^2
+    Peak Dynamic = 0.275566 W
+    Subthreshold Leakage = 0.00929753 W
+    Gate Leakage = 0.00179126 W
+    Runtime Dynamic = 0.193681 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 8.77595 mm^2
+    Peak Dynamic = 6.17873 W
+    Subthreshold Leakage = 0.108357 W
+    Gate Leakage = 0.0139259 W
+    Runtime Dynamic = 0.963385 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 7.08925 mm^2
+    Peak Dynamic = 3.3131 W
+    Subthreshold Leakage = 0.0450389 W
+    Gate Leakage = 0.00571171 W
+    Runtime Dynamic = 0.573656 W
+
+*****************************************************************************************
+Core:
+      Area = 61.1957 mm^2
+      Peak Dynamic = 19.6269 W
+      Subthreshold Leakage = 2.04452 W
+      Gate Leakage = 0.277429 W
+      Runtime Dynamic = 29.5972 W
+
+      Instruction Fetch Unit:
+        Area = 7.40352 mm^2
+        Peak Dynamic = 2.10646 W
+        Subthreshold Leakage = 0.126581 W
+        Gate Leakage = 0.0150397 W
+        Runtime Dynamic = 2.55478 W
+
+          Instruction Cache:
+            Area = 5.01657 mm^2
+            Peak Dynamic = 0.745807 W
+            Subthreshold Leakage = 0.0906167 W
+            Gate Leakage = 0.010922 W
+            Runtime Dynamic = 1.22193 W
+
+          Branch Target Buffer:
+            Area = 1.63475 mm^2
+            Peak Dynamic = 0.0974373 W
+            Subthreshold Leakage = 0.0188281 W
+            Gate Leakage = 0.00126965 W
+            Runtime Dynamic = 0.389749 W
+
+          Branch Predictor:
+            Area = 0.474272 mm^2
+            Peak Dynamic = 0.0682449 W
+            Subthreshold Leakage = 0.00901262 W
+            Gate Leakage = 0.00067136 W
+            Runtime Dynamic = 0.0636543 W
+
+              Global Predictor:
+                Area = 0.190297 mm^2
+                Peak Dynamic = 0.0224229 W
+                Subthreshold Leakage = 0.00351842 W
+                Gate Leakage = 0.000260107 W
+                Runtime Dynamic = 0.0239711 W
+
+              Local Predictor:
+                Area = 0.0959237 mm^2
+                Peak Dynamic = 0.0143301 W
+                Subthreshold Leakage = 0.00171829 W
+                Gate Leakage = 0.00012889 W
+                Runtime Dynamic = 0.015711 W
+
+                Area = 0.0484908 mm^2
+                Peak Dynamic = 0.0077514 W
+                Subthreshold Leakage = 0.000926283 W
+                Gate Leakage = 7.55051e-05 W
+                Runtime Dynamic = 0.00850163 W
+
+              Chooser:
+                Area = 0.190297 mm^2
+                Peak Dynamic = 0.0224229 W
+                Subthreshold Leakage = 0.00351842 W
+                Gate Leakage = 0.000260107 W
+                Runtime Dynamic = 0.0239711 W
+
+              RAS:
+                Area = 0.0451868 mm^2
+                Peak Dynamic = 0.00906891 W
+                Subthreshold Leakage = 0.00025749 W
+                Gate Leakage = 2.22565e-05 W
+                Runtime Dynamic = 1.06361e-06 W
+
+          Instruction Buffer:
+            Area = 0.11139 mm^2
+            Peak Dynamic = 0.30298 W
+            Subthreshold Leakage = 0.000556928 W
+            Gate Leakage = 4.34124e-05 W
+            Runtime Dynamic = 0.201987 W
+
+          Instruction Decoder:
+            Area = 0.0481902 mm^2
+            Peak Dynamic = 0.677465 W
+            Subthreshold Leakage = 0.00135195 W
+            Gate Leakage = 0.000132907 W
+            Runtime Dynamic = 0.677465 W
+
+      Renaming Unit:
+        Area = 4.5037 mm^2
+        Peak Dynamic = 4.11785 W
+        Subthreshold Leakage = 0.0296009 W
+        Gate Leakage = 0.00668098 W
+        Runtime Dynamic = 3.24944 W
+
+          Int Front End RAT:
+            Area = 2.76467 mm^2
+            Peak Dynamic = 2.43279 W
+            Subthreshold Leakage = 0.0129405 W
+            Gate Leakage = 0.00255854 W
+            Runtime Dynamic = 2.43279 W
+
+          FP Front End RAT:
+            Area = 1.39233 mm^2
+            Peak Dynamic = 1.35403 W
+            Subthreshold Leakage = 0.00981219 W
+            Gate Leakage = 0.00205621 W
+            Runtime Dynamic = 0.677017 W
+
+          Free List:
+            Area = 0.116928 mm^2
+            Peak Dynamic = 0.0436483 W
+            Subthreshold Leakage = 0.000259915 W
+            Gate Leakage = 2.53395e-05 W
+            Runtime Dynamic = 0.0872966 W
+
+          Int Retire RAT: 
+            Area = 0.0429772 mm^2
+            Peak Dynamic = 0.0318091 W
+            Subthreshold Leakage = 0.000152798 W
+            Gate Leakage = 1.86722e-05 W
+            Runtime Dynamic = 0.0318091 W
+
+          FP Retire RAT:
+            Area = 0.0153516 mm^2
+            Peak Dynamic = 0.00997874 W
+            Subthreshold Leakage = 8.06509e-05 W
+            Gate Leakage = 7.17049e-06 W
+            Runtime Dynamic = 0.00498937 W
+
+          FP Free List:
+            Area = 0.0530951 mm^2
+            Peak Dynamic = 0.0310624 W
+            Subthreshold Leakage = 0.000140326 W
+            Gate Leakage = 1.46766e-05 W
+            Runtime Dynamic = 0.0155312 W
+
+      Load Store Unit:
+        Area = 20.5622 mm^2
+        Peak Dynamic = 5.14439 W
+        Subthreshold Leakage = 0.207699 W
+        Gate Leakage = 0.0357344 W
+        Runtime Dynamic = 16.0217 W
+
+          Data Cache:
+            Area = 15.2468 mm^2
+            Peak Dynamic = 4.5468 W
+            Subthreshold Leakage = 0.19694 W
+            Gate Leakage = 0.0331746 W
+            Runtime Dynamic = 15.8781 W
+
+          LoadQ:
+            Area = 0.863734 mm^2
+            Peak Dynamic = 0.191536 W
+            Subthreshold Leakage = 0.00227213 W
+            Gate Leakage = 0.000279753 W
+            Runtime Dynamic = 0.047884 W
+
+          StoreQ:
+            Area = 0.863734 mm^2
+            Peak Dynamic = 0.191536 W
+            Subthreshold Leakage = 0.00227213 W
+            Gate Leakage = 0.000279753 W
+            Runtime Dynamic = 0.0957681 W
+
+      Memory Management Unit:
+        Area = 3.49533 mm^2
+        Peak Dynamic = 1.34391 W
+        Subthreshold Leakage = 0.0412098 W
+        Gate Leakage = 0.00931467 W
+        Runtime Dynamic = 2.25879 W
+
+          Itlb:
+            Area = 1.12903 mm^2
+            Peak Dynamic = 0.425717 W
+            Subthreshold Leakage = 0.0152632 W
+            Gate Leakage = 0.00308734 W
+            Runtime Dynamic = 0.851444 W
+
+          Dtlb:
+            Area = 2.24796 mm^2
+            Peak Dynamic = 0.703668 W
+            Subthreshold Leakage = 0.0197321 W
+            Gate Leakage = 0.00422696 W
+            Runtime Dynamic = 1.40735 W
+
+      Execution Unit:
+        Area = 18.9802 mm^2
+        Peak Dynamic = 6.91426 W
+        Subthreshold Leakage = 1.01207 W
+        Gate Leakage = 0.130415 W
+        Runtime Dynamic = 5.51245 W
+
+          Register Files:
+            Area = 4.63431 mm^2
+            Peak Dynamic = 1.07973 W
+            Subthreshold Leakage = 0.00557121 W
+            Gate Leakage = 0.000534421 W
+            Runtime Dynamic = 0.491409 W
+
+              Integer RF:
+                Area = 3.11444 mm^2
+                Peak Dynamic = 0.64479 W
+                Subthreshold Leakage = 0.00348926 W
+                Gate Leakage = 0.000338898 W
+                Runtime Dynamic = 0.43963 W
+
+              Floating Point RF:
+                Area = 1.51987 mm^2
+                Peak Dynamic = 0.434944 W
+                Subthreshold Leakage = 0.00208194 W
+                Gate Leakage = 0.000195523 W
+                Runtime Dynamic = 0.051779 W
+
+          Instruction Scheduler:
+            Area = 2.2958 mm^2
+            Peak Dynamic = 0.682653 W
+            Subthreshold Leakage = 0.0043779 W
+            Gate Leakage = 0.000496354 W
+            Runtime Dynamic = 0.783433 W
+
+              Instruction Window:
+                Area = 0.416485 mm^2
+                Peak Dynamic = 0.230852 W
+                Subthreshold Leakage = 0.001531 W
+                Gate Leakage = 0.000214549 W
+                Runtime Dynamic = 0.308242 W
+
+              FP Instruction Window:
+                Area = 0.160067 mm^2
+                Peak Dynamic = 0.0899719 W
+                Subthreshold Leakage = 0.000573841 W
+                Gate Leakage = 9.08104e-05 W
+                Runtime Dynamic = 0.113361 W
+
+              ROB:
+                Area = 1.71925 mm^2
+                Peak Dynamic = 0.361829 W
+                Subthreshold Leakage = 0.00227307 W
+                Gate Leakage = 0.000190995 W
+                Runtime Dynamic = 0.361829 W
+
+          Integer ALUs (Count: 4 ):
+            Area = 2.56256 mm^2
+            Peak Dynamic = 1.45952 W
+            Subthreshold Leakage = 0.514377 W
+            Gate Leakage = 0.0657924 W
+            Runtime Dynamic = 1.12031 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 9.317 mm^2
+            Peak Dynamic = 1.32571 W
+            Subthreshold Leakage = 0.467545 W
+            Gate Leakage = 0.0598023 W
+            Runtime Dynamic = 1.32571 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0521609 mm^2
+            Peak Dynamic = 2.15212 W
+            Subthreshold Leakage = 0.0139887 W
+            Gate Leakage = 0.00178925 W
+            Runtime Dynamic = 1.79159 W
+
+*****************************************************************************************
+L2
+      Area = 62.2653 mm^2
+      Peak Dynamic = 1.42987 W
+      Subthreshold Leakage = 1.65481 W
+      Gate Leakage = 0.00860545 W
+      Runtime Dynamic = 2.73329 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 0.533824 mm^2
+      Peak Dynamic = 0.275566 W
+      Subthreshold Leakage = 0.00929753 W
+      Gate Leakage = 0.00179126 W
+      Runtime Dynamic = 0.193681 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 3.54463 mm^2
+      Peak Dynamic = 1.65655 W
+      Subthreshold Leakage = 0.0225194 W
+      Gate Leakage = 0.00285586 W
+      Runtime Dynamic = 0.573656 W
+
+      Front End Engine:
+        Area = 1.72828 mm^2
+        Peak Dynamic = 0.389588 W
+        Subthreshold Leakage = 0.00246696 W
+        Gate Leakage = 0.000291005 W
+        Runtime Dynamic = 0.0911898 W
+
+      Transaction Engine:
+        Area = 0.75308 mm^2
+        Peak Dynamic = 1.13896 W
+        Subthreshold Leakage = 0.00831402 W
+        Gate Leakage = 0.00106342 W
+        Runtime Dynamic = 0.341678 W
+
+      PHY:
+        Area = 1.06326 mm^2
+        Peak Dynamic = 0.128 W
+        Subthreshold Leakage = 0.0117384 W
+        Gate Leakage = 0.00150143 W
+        Runtime Dynamic = 0.140788 W
+
+*****************************************************************************************
+NOC
+      Area = 8.77595 mm^2
+      Peak Dynamic = 6.17873 W
+      Subthreshold Leakage = 0.108357 W
+      Gate Leakage = 0.0139259 W
+      Runtime Dynamic = 0.963385 W
+
+      Router: 
+        Area = 8.3047 mm^2
+        Peak Dynamic = 2.78895 W
+        Subthreshold Leakage = 0.0606175 W
+        Gate Leakage = 0.00781974 W
+        Runtime Dynamic = 0.398421 W
+
+            Virtual Channel Buffer:
+              Area = 4.2978 mm^2
+              Peak Dynamic = 2.31409 W
+              Subthreshold Leakage = 0.028002 W
+              Gate Leakage = 0.00227471 W
+              Runtime Dynamic = 0.330584 W
+
+            Crossbar:
+              Area = 0.160538 mm^2
+              Peak Dynamic = 0.437862 W
+              Subthreshold Leakage = 0.0325996 W
+              Gate Leakage = 0.00554292 W
+              Runtime Dynamic = 0.0625517 W
+
+            Arbiter:
+              Peak Dynamic = 0.0370018 W
+              Subthreshold Leakage = 1.5858e-05 W
+              Gate Leakage = 2.11117e-06 W
+              Runtime Dynamic = 0.00528597 W
+
+      Per Router Links: 
+        Area = 0.471256 mm^2
+        Peak Dynamic = 3.38978 W
+        Subthreshold Leakage = 0.0477391 W
+        Gate Leakage = 0.00610616 W
+        Runtime Dynamic = 0.564963 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/Penryn b/ext/mcpat/results/Penryn
new file mode 100644 (file)
index 0000000..af39390
--- /dev/null
@@ -0,0 +1,315 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 45 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3700
+
+*****************************************************************************************
+Processor: 
+  Area = 92.2661 mm^2
+  Peak Power = 61.0228 W
+  Total Leakage = 10.8609 W
+  Peak Dynamic = 50.1619 W
+  Subthreshold Leakage = 10.2773 W
+  Gate Leakage = 0.583567 W
+  Runtime Dynamic = 69.6347 W
+
+  Total Cores: 2 cores 
+  Device Type= ITRS high performance device type
+    Area = 48.2438 mm^2
+    Peak Dynamic = 39.6676 W
+    Subthreshold Leakage = 6.96165 W
+    Gate Leakage = 0.541077 W
+    Runtime Dynamic = 51.4987 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 43.1009 mm^2
+    Peak Dynamic = 6.43272 W
+    Subthreshold Leakage = 3.28049 W
+    Gate Leakage = 0.0386655 W
+    Runtime Dynamic = 13.716 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 0.921404 mm^2
+    Peak Dynamic = 4.06164 W
+    Subthreshold Leakage = 0.035183 W
+    Gate Leakage = 0.00382481 W
+    Runtime Dynamic = 4.42002 W
+
+*****************************************************************************************
+Core:
+      Area = 24.1219 mm^2
+      Peak Dynamic = 19.8338 W
+      Subthreshold Leakage = 3.48083 W
+      Gate Leakage = 0.270538 W
+      Runtime Dynamic = 51.4987 W
+
+      Instruction Fetch Unit:
+        Area = 3.13582 mm^2
+        Peak Dynamic = 2.49774 W
+        Subthreshold Leakage = 0.421089 W
+        Gate Leakage = 0.0246791 W
+        Runtime Dynamic = 2.42869 W
+
+          Instruction Cache:
+            Area = 0.702441 mm^2
+            Peak Dynamic = 0.419702 W
+            Subthreshold Leakage = 0.0413175 W
+            Gate Leakage = 0.00175164 W
+            Runtime Dynamic = 0.487111 W
+
+          Branch Target Buffer:
+            Area = 0.349484 mm^2
+            Peak Dynamic = 0.0903353 W
+            Subthreshold Leakage = 0.0243658 W
+            Gate Leakage = 0.000966387 W
+            Runtime Dynamic = 0.361341 W
+
+          Branch Predictor:
+            Area = 0.153017 mm^2
+            Peak Dynamic = 0.0718712 W
+            Subthreshold Leakage = 0.0142615 W
+            Gate Leakage = 0.000619154 W
+            Runtime Dynamic = 0.0647272 W
+
+              Global Predictor:
+                Area = 0.0475693 mm^2
+                Peak Dynamic = 0.0231158 W
+                Subthreshold Leakage = 0.00544747 W
+                Gate Leakage = 0.000234591 W
+                Runtime Dynamic = 0.0245764 W
+
+              Local Predictor:
+              L1_Local Predictor:
+                Area = 0.0239764 mm^2
+                Peak Dynamic = 0.0142817 W
+                Subthreshold Leakage = 0.00265926 W
+                Gate Leakage = 0.00011608 W
+                Runtime Dynamic = 0.0155731 W
+
+              L2_Local Predictor:
+                Area = 0.012121 mm^2
+                Peak Dynamic = 0.00767395 W
+                Subthreshold Leakage = 0.00143248 W
+                Gate Leakage = 6.77717e-05 W
+                Runtime Dynamic = 0.00837399 W
+
+              Chooser:
+                Area = 0.0475693 mm^2
+                Peak Dynamic = 0.0231158 W
+                Subthreshold Leakage = 0.00544747 W
+                Gate Leakage = 0.000234591 W
+                Runtime Dynamic = 0.0245764 W
+
+              RAS:
+                Area = 0.0217815 mm^2
+                Peak Dynamic = 0.0113578 W
+                Subthreshold Leakage = 0.000707258 W
+                Gate Leakage = 3.38921e-05 W
+                Runtime Dynamic = 1.2459e-06 W
+
+          Instruction Buffer:
+            Area = 0.0278406 mm^2
+            Peak Dynamic = 0.282368 W
+            Subthreshold Leakage = 0.000861686 W
+            Gate Leakage = 3.91839e-05 W
+            Runtime Dynamic = 0.188245 W
+
+          Instruction Decoder:
+            Area = 1.85799 mm^2
+            Peak Dynamic = 1.32726 W
+            Subthreshold Leakage = 0.325606 W
+            Gate Leakage = 0.0185411 W
+            Runtime Dynamic = 1.32726 W
+
+      Renaming Unit:
+        Area = 1.02517 mm^2
+        Peak Dynamic = 2.25746 W
+        Subthreshold Leakage = 0.042129 W
+        Gate Leakage = 0.00480502 W
+        Runtime Dynamic = 1.55315 W
+
+          Int Front End RAT:
+            Area = 0.59725 mm^2
+            Peak Dynamic = 1.25286 W
+            Subthreshold Leakage = 0.0159587 W
+            Gate Leakage = 0.00122436 W
+            Runtime Dynamic = 1.11309 W
+
+          FP Front End RAT:
+            Area = 0.350662 mm^2
+            Peak Dynamic = 0.652971 W
+            Subthreshold Leakage = 0.0110219 W
+            Gate Leakage = 0.00079321 W
+            Runtime Dynamic = 0.326485 W
+
+          Free List:
+            Area = 0.0322035 mm^2
+            Peak Dynamic = 0.0454309 W
+            Subthreshold Leakage = 0.000471802 W
+            Gate Leakage = 2.57995e-05 W
+            Runtime Dynamic = 0.113577 W
+
+      Load Store Unit:
+        Area = 7.24152 mm^2
+        Peak Dynamic = 6.57278 W
+        Subthreshold Leakage = 0.310798 W
+        Gate Leakage = 0.0358085 W
+        Runtime Dynamic = 34.9208 W
+
+          Data Cache:
+            Area = 4.65034 mm^2
+            Peak Dynamic = 5.03369 W
+            Subthreshold Leakage = 0.237004 W
+            Gate Leakage = 0.0253255 W
+            Runtime Dynamic = 33.601 W
+
+          LoadQ:
+            Area = 0.260806 mm^2
+            Peak Dynamic = 0.132332 W
+            Subthreshold Leakage = 0.00523814 W
+            Gate Leakage = 0.000359005 W
+            Runtime Dynamic = 0.0661662 W
+
+          StoreQ:
+            Area = 1.06006 mm^2
+            Peak Dynamic = 1.25365 W
+            Subthreshold Leakage = 0.0538794 W
+            Gate Leakage = 0.00736236 W
+            Runtime Dynamic = 1.25365 W
+
+      Memory Management Unit:
+        Area = 0.363299 mm^2
+        Peak Dynamic = 0.610831 W
+        Subthreshold Leakage = 0.0388017 W
+        Gate Leakage = 0.00431691 W
+        Runtime Dynamic = 1.29234 W
+
+          Itlb:
+            Area = 0.0590462 mm^2
+            Peak Dynamic = 0.116192 W
+            Subthreshold Leakage = 0.00608044 W
+            Gate Leakage = 0.000398475 W
+            Runtime Dynamic = 0.232386 W
+
+          Dtlb:
+            Area = 0.259199 mm^2
+            Peak Dynamic = 0.264986 W
+            Subthreshold Leakage = 0.0180446 W
+            Gate Leakage = 0.00115678 W
+            Runtime Dynamic = 1.05995 W
+
+      Execution Unit:
+        Area = 7.9594 mm^2
+        Peak Dynamic = 7.89497 W
+        Subthreshold Leakage = 1.28761 W
+        Gate Leakage = 0.0977152 W
+        Runtime Dynamic = 11.3037 W
+
+          Register Files:
+            Area = 0.528076 mm^2
+            Peak Dynamic = 0.554172 W
+            Subthreshold Leakage = 0.00459231 W
+            Gate Leakage = 0.000305031 W
+            Runtime Dynamic = 0.283985 W
+
+              Integer RF:
+                Area = 0.336446 mm^2
+                Peak Dynamic = 0.461344 W
+                Subthreshold Leakage = 0.00257976 W
+                Gate Leakage = 0.00018025 W
+                Runtime Dynamic = 0.247149 W
+
+              Floating Point RF:
+                Area = 0.19163 mm^2
+                Peak Dynamic = 0.0928276 W
+                Subthreshold Leakage = 0.00201255 W
+                Gate Leakage = 0.000124781 W
+                Runtime Dynamic = 0.0368364 W
+
+          Instruction Scheduler:
+            Area = 1.97424 mm^2
+            Peak Dynamic = 1.76421 W
+            Subthreshold Leakage = 0.0212898 W
+            Gate Leakage = 0.0014052 W
+            Runtime Dynamic = 1.96388 W
+
+              Instruction Window:
+                Area = 0.889691 mm^2
+                Peak Dynamic = 0.468182 W
+                Subthreshold Leakage = 0.0081033 W
+                Gate Leakage = 0.000620258 W
+                Runtime Dynamic = 0.601258 W
+
+              FP Instruction Window:
+                Area = 0.347423 mm^2
+                Peak Dynamic = 0.230453 W
+                Subthreshold Leakage = 0.00381664 W
+                Gate Leakage = 0.000293336 W
+                Runtime Dynamic = 0.29704 W
+
+              ROB:
+                Area = 0.737129 mm^2
+                Peak Dynamic = 1.06558 W
+                Subthreshold Leakage = 0.00936988 W
+                Gate Leakage = 0.000491606 W
+                Runtime Dynamic = 1.06558 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 0.47087 mm^2
+            Peak Dynamic = 2.2206 W
+            Subthreshold Leakage = 0.295671 W
+            Gate Leakage = 0.0221076 W
+            Runtime Dynamic = 1.14549 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 4.6585 mm^2
+            Peak Dynamic = 0.708407 W
+            Subthreshold Leakage = 0.731296 W
+            Gate Leakage = 0.0546797 W
+            Runtime Dynamic = 1.28625 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.235435 mm^2
+            Peak Dynamic = 0.257249 W
+            Subthreshold Leakage = 0.147835 W
+            Gate Leakage = 0.0110538 W
+            Runtime Dynamic = 1.57424 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0472187 mm^2
+            Peak Dynamic = 2.08413 W
+            Subthreshold Leakage = 0.0722513 W
+            Gate Leakage = 0.00540229 W
+            Runtime Dynamic = 5.04986 W
+
+*****************************************************************************************
+L2
+      Area = 43.1009 mm^2
+      Peak Dynamic = 6.43272 W
+      Subthreshold Leakage = 3.28049 W
+      Gate Leakage = 0.0386655 W
+      Runtime Dynamic = 13.716 W
+
+*****************************************************************************************
+BUSES
+      Area = 0.921404 mm^2
+      Peak Dynamic = 4.06164 W
+      Subthreshold Leakage = 0.035183 W
+      Gate Leakage = 0.00382481 W
+      Runtime Dynamic = 4.42002 W
+
+      Bus: 
+        Area = 0.921404 mm^2
+        Peak Dynamic = 4.06164 W
+        Subthreshold Leakage = 0.035183 W
+        Gate Leakage = 0.00382481 W
+        Runtime Dynamic = 4.42002 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/T1 b/ext/mcpat/results/T1
new file mode 100644 (file)
index 0000000..f63e51c
--- /dev/null
@@ -0,0 +1,296 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 90 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 283.287 mm^2
+  Peak Power = 55.0318 W
+  Total Leakage = 9.78078 W
+  Peak Dynamic = 45.2511 W
+  Subthreshold Leakage = 8.64906 W
+  Gate Leakage = 1.13172 W
+  Runtime Dynamic = 45.5013 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 117.887 mm^2
+    Peak Dynamic = 28.1307 W
+    Subthreshold Leakage = 5.19354 W
+    Gate Leakage = 0.730037 W
+    Runtime Dynamic = 18.917 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 116.308 mm^2
+    Peak Dynamic = 5.51367 W
+    Subthreshold Leakage = 2.41316 W
+    Gate Leakage = 0.242513 W
+    Runtime Dynamic = 4.00707 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 8.77473 mm^2
+    Peak Dynamic = 3.38588 W
+    Subthreshold Leakage = 0.224524 W
+    Gate Leakage = 0.0320801 W
+    Runtime Dynamic = 15.1158 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 8.87598 mm^2
+    Peak Dynamic = 3.67515 W
+    Subthreshold Leakage = 0.488892 W
+    Gate Leakage = 0.0852308 W
+    Runtime Dynamic = 2.20509 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 31.441 mm^2
+    Peak Dynamic = 4.5457 W
+    Subthreshold Leakage = 0.328953 W
+    Gate Leakage = 0.0418558 W
+    Runtime Dynamic = 5.25637 W
+
+*****************************************************************************************
+Core:
+      Area = 14.7359 mm^2
+      Peak Dynamic = 3.51633 W
+      Subthreshold Leakage = 0.649192 W
+      Gate Leakage = 0.0912546 W
+      Runtime Dynamic = 18.917 W
+
+      Instruction Fetch Unit:
+        Area = 3.60967 mm^2
+        Peak Dynamic = 0.560912 W
+        Subthreshold Leakage = 0.0396492 W
+        Gate Leakage = 0.00709504 W
+        Runtime Dynamic = 3.76593 W
+
+          Instruction Cache:
+            Area = 3.41818 mm^2
+            Peak Dynamic = 0.308492 W
+            Subthreshold Leakage = 0.0286475 W
+            Gate Leakage = 0.00418329 W
+            Runtime Dynamic = 0.95332 W
+
+          Instruction Buffer:
+            Area = 0.0122742 mm^2
+            Peak Dynamic = 0.0121268 W
+            Subthreshold Leakage = 0.0002042 W
+            Gate Leakage = 1.78658e-05 W
+            Runtime Dynamic = 0.0970143 W
+
+          Instruction Decoder:
+            Area = 0.0229327 mm^2
+            Peak Dynamic = 0.169467 W
+            Subthreshold Leakage = 0.00259055 W
+            Gate Leakage = 0.000252139 W
+            Runtime Dynamic = 1.35574 W
+
+      Load Store Unit:
+        Area = 3.07616 mm^2
+        Peak Dynamic = 0.390349 W
+        Subthreshold Leakage = 0.0362126 W
+        Gate Leakage = 0.00713432 W
+        Runtime Dynamic = 3.85623 W
+
+          Data Cache:
+            Area = 1.47986 mm^2
+            Peak Dynamic = 0.191211 W
+            Subthreshold Leakage = 0.0157454 W
+            Gate Leakage = 0.00208738 W
+            Runtime Dynamic = 0.443377 W
+
+          Load/Store Queue:
+            Area = 1.17458 mm^2
+            Peak Dynamic = 0.128312 W
+            Subthreshold Leakage = 0.0122603 W
+            Gate Leakage = 0.0024052 W
+            Runtime Dynamic = 2.05299 W
+
+      Memory Management Unit:
+        Area = 1.27751 mm^2
+        Peak Dynamic = 0.324071 W
+        Subthreshold Leakage = 0.0192968 W
+        Gate Leakage = 0.0049902 W
+        Runtime Dynamic = 2.53591 W
+
+          Itlb:
+            Area = 0.560615 mm^2
+            Peak Dynamic = 0.117604 W
+            Subthreshold Leakage = 0.00554488 W
+            Gate Leakage = 0.00117423 W
+            Runtime Dynamic = 0.940838 W
+
+          Dtlb:
+            Area = 0.560615 mm^2
+            Peak Dynamic = 0.0294011 W
+            Subthreshold Leakage = 0.00554488 W
+            Gate Leakage = 0.00117423 W
+            Runtime Dynamic = 0.235211 W
+
+      Execution Unit:
+        Area = 3.47025 mm^2
+        Peak Dynamic = 2.241 W
+        Subthreshold Leakage = 0.222601 W
+        Gate Leakage = 0.0296426 W
+        Runtime Dynamic = 8.75894 W
+
+          Register Files:
+            Area = 1.38355 mm^2
+            Peak Dynamic = 0.0746572 W
+            Subthreshold Leakage = 0.00827136 W
+            Gate Leakage = 0.000628178 W
+            Runtime Dynamic = 0.320633 W
+
+              Integer RF:
+                Area = 0.592652 mm^2
+                Peak Dynamic = 0.0582404 W
+                Subthreshold Leakage = 0.00161128 W
+                Gate Leakage = 0.000148771 W
+                Runtime Dynamic = 0.312722 W
+
+              Floating Point RF:
+                Area = 0.592652 mm^2
+                Peak Dynamic = 0.0164168 W
+                Subthreshold Leakage = 0.00161128 W
+                Gate Leakage = 0.000148771 W
+                Runtime Dynamic = 0.00783962 W
+
+              Register Windows:
+                Area = 0.198243 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00504879 W
+                Gate Leakage = 0.000330636 W
+                Runtime Dynamic = 7.11291e-05 W
+
+          Instruction Scheduler:
+            Area = 0.04377 mm^2
+            Peak Dynamic = 0.0284368 W
+            Subthreshold Leakage = 0.000336066 W
+            Gate Leakage = 5.10703e-05 W
+            Runtime Dynamic = 0.244528 W
+
+              Instruction Window:
+                Area = 0.04377 mm^2
+                Peak Dynamic = 0.0284368 W
+                Subthreshold Leakage = 0.000336066 W
+                Gate Leakage = 5.10703e-05 W
+                Runtime Dynamic = 0.244528 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.16016 mm^2
+            Peak Dynamic = 0.305285 W
+            Subthreshold Leakage = 0.0321485 W
+            Gate Leakage = 0.00411202 W
+            Runtime Dynamic = 2.71365 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 1.16463 mm^2
+            Peak Dynamic = 0.0508808 W
+            Subthreshold Leakage = 0.0584431 W
+            Gate Leakage = 0.00747528 W
+            Runtime Dynamic = 0.101762 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.48048 mm^2
+            Peak Dynamic = 0.339206 W
+            Subthreshold Leakage = 0.0964456 W
+            Gate Leakage = 0.0123361 W
+            Runtime Dynamic = 0.678411 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0813807 mm^2
+            Peak Dynamic = 1.18756 W
+            Subthreshold Leakage = 0.0187498 W
+            Gate Leakage = 0.00239823 W
+            Runtime Dynamic = 3.3401 W
+
+*****************************************************************************************
+L2
+      Area = 29.0771 mm^2
+      Peak Dynamic = 1.37842 W
+      Subthreshold Leakage = 0.603289 W
+      Gate Leakage = 0.0606283 W
+      Runtime Dynamic = 4.00707 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 2.19368 mm^2
+      Peak Dynamic = 0.84647 W
+      Subthreshold Leakage = 0.0561311 W
+      Gate Leakage = 0.00802003 W
+      Runtime Dynamic = 15.1158 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 7.86025 mm^2
+      Peak Dynamic = 1.13642 W
+      Subthreshold Leakage = 0.0822383 W
+      Gate Leakage = 0.0104639 W
+      Runtime Dynamic = 5.25637 W
+
+      Front End Engine:
+        Area = 0.63078 mm^2
+        Peak Dynamic = 0.0549429 W
+        Subthreshold Leakage = 0.00242476 W
+        Gate Leakage = 0.00025524 W
+        Runtime Dynamic = 0.241753 W
+
+      Transaction Engine:
+        Area = 2.59502 mm^2
+        Peak Dynamic = 0.569482 W
+        Subthreshold Leakage = 0.0286491 W
+        Gate Leakage = 0.00366442 W
+        Runtime Dynamic = 2.50577 W
+
+      PHY:
+        Area = 4.63445 mm^2
+        Peak Dynamic = 0.512 W
+        Subthreshold Leakage = 0.0511644 W
+        Gate Leakage = 0.00654429 W
+        Runtime Dynamic = 2.50885 W
+
+*****************************************************************************************
+NOC
+      Area = 8.87598 mm^2
+      Peak Dynamic = 3.67515 W
+      Subthreshold Leakage = 0.488892 W
+      Gate Leakage = 0.0852308 W
+      Runtime Dynamic = 2.20509 W
+
+      Router: 
+        Area = 4.43799 mm^2
+        Peak Dynamic = 1.83757 W
+        Subthreshold Leakage = 0.244446 W
+        Gate Leakage = 0.0426154 W
+        Runtime Dynamic = 2.20509 W
+
+            Virtual Channel Buffer:
+              Area = 1.22928 mm^2
+              Peak Dynamic = 0.0508654 W
+              Subthreshold Leakage = 0.000485491 W
+              Gate Leakage = 7.24213e-05 W
+              Runtime Dynamic = 0.0610385 W
+
+            Crossbar:
+              Area = 1.35717 mm^2
+              Peak Dynamic = 1.77185 W
+              Subthreshold Leakage = 0.243949 W
+              Gate Leakage = 0.0425414 W
+              Runtime Dynamic = 2.12622 W
+
+            Arbiter:
+              Peak Dynamic = 0.0148566 W
+              Subthreshold Leakage = 1.15783e-05 W
+              Gate Leakage = 1.54103e-06 W
+              Runtime Dynamic = 0.0178279 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/T1_DC_64 b/ext/mcpat/results/T1_DC_64
new file mode 100644 (file)
index 0000000..cdb0a1b
--- /dev/null
@@ -0,0 +1,270 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+line64
+size1.04858e+06
+line9
+size1.04858e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 322.362 mm^2
+  Peak Power = 112.557 W
+  Total Leakage = 28.0714 W
+  Peak Dynamic = 84.4853 W
+  Subthreshold Leakage = 27.7571 W
+  Gate Leakage = 0.314289 W
+  Runtime Dynamic = 13.4278 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 161.532 mm^2
+    Peak Dynamic = 21.1059 W
+    Subthreshold Leakage = 8.9583 W
+    Gate Leakage = 0.100733 W
+    Runtime Dynamic = 1.14063 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 22.1741 mm^2
+    Peak Dynamic = 0.831407 W
+    Subthreshold Leakage = 1.57123 W
+    Gate Leakage = 0.0148674 W
+    Runtime Dynamic = 0.175856 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 51.4571 mm^2
+    Peak Dynamic = 20.122 W
+    Subthreshold Leakage = 9.42527 W
+    Gate Leakage = 0.118774 W
+    Runtime Dynamic = 2.49747 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.52394 mm^2
+      Peak Dynamic = 0.32978 W
+      Subthreshold Leakage = 0.139973 W
+      Gate Leakage = 0.00157395 W
+      Runtime Dynamic = 1.14063 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 2.77176 mm^2
+      Peak Dynamic = 0.103926 W
+      Subthreshold Leakage = 0.196403 W
+      Gate Leakage = 0.00185842 W
+      Runtime Dynamic = 0.175856 W
+
+*****************************************************************************************
+NOC
+      Area = 51.4571 mm^2
+      Peak Dynamic = 20.122 W
+      Subthreshold Leakage = 9.42527 W
+      Gate Leakage = 0.118774 W
+      Runtime Dynamic = 2.49747 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.225583 mm^2
+        Peak Dynamic = 0.129858 W
+        Subthreshold Leakage = 0.0217549 W
+        Gate Leakage = 0.000214933 W
+        Runtime Dynamic = 1.16872 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/T1_SBT_64 b/ext/mcpat/results/T1_SBT_64
new file mode 100644 (file)
index 0000000..ec8968a
--- /dev/null
@@ -0,0 +1,252 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+line72
+size1.17965e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 321.412 mm^2
+  Peak Power = 114.076 W
+  Total Leakage = 27.4353 W
+  Peak Dynamic = 86.6406 W
+  Subthreshold Leakage = 27.1256 W
+  Gate Leakage = 0.309772 W
+  Runtime Dynamic = 13.4064 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 182.778 mm^2
+    Peak Dynamic = 24.1051 W
+    Subthreshold Leakage = 9.90006 W
+    Gate Leakage = 0.111104 W
+    Runtime Dynamic = 1.29686 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 51.4353 mm^2
+    Peak Dynamic = 20.1095 W
+    Subthreshold Leakage = 9.42317 W
+    Gate Leakage = 0.118753 W
+    Runtime Dynamic = 2.4957 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.85591 mm^2
+      Peak Dynamic = 0.376642 W
+      Subthreshold Leakage = 0.154688 W
+      Gate Leakage = 0.001736 W
+      Runtime Dynamic = 1.29686 W
+
+*****************************************************************************************
+NOC
+      Area = 51.4353 mm^2
+      Peak Dynamic = 20.1095 W
+      Subthreshold Leakage = 9.42317 W
+      Gate Leakage = 0.118753 W
+      Runtime Dynamic = 2.4957 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.225243 mm^2
+        Peak Dynamic = 0.129662 W
+        Subthreshold Leakage = 0.0217221 W
+        Gate Leakage = 0.000214609 W
+        Runtime Dynamic = 1.16696 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/T1_ST_64 b/ext/mcpat/results/T1_ST_64
new file mode 100644 (file)
index 0000000..f3d95b5
--- /dev/null
@@ -0,0 +1,270 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+line64
+size1.04858e+06
+line9
+size8.38861e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 358.016 mm^2
+  Peak Power = 168.519 W
+  Total Leakage = 30.8855 W
+  Peak Dynamic = 137.634 W
+  Subthreshold Leakage = 30.5351 W
+  Gate Leakage = 0.350385 W
+  Runtime Dynamic = 84.2366 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 161.532 mm^2
+    Peak Dynamic = 21.1059 W
+    Subthreshold Leakage = 8.9583 W
+    Gate Leakage = 0.100733 W
+    Runtime Dynamic = 1.14063 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 57.033 mm^2
+    Peak Dynamic = 53.5219 W
+    Subthreshold Leakage = 4.27249 W
+    Gate Leakage = 0.050206 W
+    Runtime Dynamic = 70.9203 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 52.2524 mm^2
+    Peak Dynamic = 20.5798 W
+    Subthreshold Leakage = 9.50197 W
+    Gate Leakage = 0.119531 W
+    Runtime Dynamic = 2.56185 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.52394 mm^2
+      Peak Dynamic = 0.32978 W
+      Subthreshold Leakage = 0.139973 W
+      Gate Leakage = 0.00157395 W
+      Runtime Dynamic = 1.14063 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 57.033 mm^2
+      Peak Dynamic = 53.5219 W
+      Subthreshold Leakage = 4.27249 W
+      Gate Leakage = 0.050206 W
+      Runtime Dynamic = 70.9203 W
+
+*****************************************************************************************
+NOC
+      Area = 52.2524 mm^2
+      Peak Dynamic = 20.5798 W
+      Subthreshold Leakage = 9.50197 W
+      Gate Leakage = 0.119531 W
+      Runtime Dynamic = 2.56185 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.238009 mm^2
+        Peak Dynamic = 0.137011 W
+        Subthreshold Leakage = 0.0229533 W
+        Gate Leakage = 0.000226773 W
+        Runtime Dynamic = 1.2331 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/T2 b/ext/mcpat/results/T2
new file mode 100644 (file)
index 0000000..e24701a
--- /dev/null
@@ -0,0 +1,321 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1400
+
+*****************************************************************************************
+Processor: 
+  Area = 277.068 mm^2
+  Peak Power = 71.8237 W
+  Total Leakage = 18.2234 W
+  Peak Dynamic = 53.6003 W
+  Subthreshold Leakage = 14.7124 W
+  Gate Leakage = 3.51096 W
+  Runtime Dynamic = 48.652 W
+
+  Total Cores: 8 cores 
+  Device Type= ITRS high performance device type
+    Area = 116.441 mm^2
+    Peak Dynamic = 28.0277 W
+    Subthreshold Leakage = 9.00023 W
+    Gate Leakage = 1.93139 W
+    Runtime Dynamic = 27.9237 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 85.0391 mm^2
+    Peak Dynamic = 9.87481 W
+    Subthreshold Leakage = 2.71188 W
+    Gate Leakage = 0.684324 W
+    Runtime Dynamic = 3.97632 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 11.6417 mm^2
+    Peak Dynamic = 5.32369 W
+    Subthreshold Leakage = 0.249885 W
+    Gate Leakage = 0.107486 W
+    Runtime Dynamic = 5.38275 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 9.56584 mm^2
+    Peak Dynamic = 1.07754 W
+    Subthreshold Leakage = 1.61961 W
+    Gate Leakage = 0.389994 W
+    Runtime Dynamic = 1.07754 W
+
+  Total MCs: 4 Memory Controllers 
+  Device Type= ITRS high performance device type
+    Area = 32.2777 mm^2
+    Peak Dynamic = 5.92507 W
+    Subthreshold Leakage = 0.559071 W
+    Gate Leakage = 0.10416 W
+    Runtime Dynamic = 7.93157 W
+
+  Total NIUs: 2 Network Interface Units 
+  Device Type= ITRS high performance device type
+    Area = 15.8633 mm^2
+    Peak Dynamic = 1.86482 W
+    Subthreshold Leakage = 0.357626 W
+    Gate Leakage = 0.183662 W
+    Runtime Dynamic = 1.30537 W
+
+  Total PCIes: 1 PCIe Controllers 
+  Device Type= ITRS high performance device type
+    Area = 6.24 mm^2
+    Peak Dynamic = 1.5067 W
+    Subthreshold Leakage = 0.214091 W
+    Gate Leakage = 0.109948 W
+    Runtime Dynamic = 1.05469 W
+
+*****************************************************************************************
+Core:
+      Area = 14.5551 mm^2
+      Peak Dynamic = 3.50346 W
+      Subthreshold Leakage = 1.12503 W
+      Gate Leakage = 0.241423 W
+      Runtime Dynamic = 27.9237 W
+
+      Instruction Fetch Unit:
+        Area = 2.75911 mm^2
+        Peak Dynamic = 0.817936 W
+        Subthreshold Leakage = 0.0912466 W
+        Gate Leakage = 0.0284483 W
+        Runtime Dynamic = 4.81754 W
+
+          Instruction Cache:
+            Area = 2.51671 mm^2
+            Peak Dynamic = 0.513783 W
+            Subthreshold Leakage = 0.062355 W
+            Gate Leakage = 0.0164185 W
+            Runtime Dynamic = 1.59033 W
+
+          Instruction Buffer:
+            Area = 0.0130935 mm^2
+            Peak Dynamic = 0.0100268 W
+            Subthreshold Leakage = 0.000434992 W
+            Gate Leakage = 6.02581e-05 W
+            Runtime Dynamic = 0.160429 W
+
+          Instruction Decoder:
+            Area = 0.0119193 mm^2
+            Peak Dynamic = 0.0892213 W
+            Subthreshold Leakage = 0.00298091 W
+            Gate Leakage = 0.000408973 W
+            Runtime Dynamic = 1.42754 W
+
+      Load Store Unit:
+        Area = 2.14252 mm^2
+        Peak Dynamic = 0.487978 W
+        Subthreshold Leakage = 0.0802768 W
+        Gate Leakage = 0.0247378 W
+        Runtime Dynamic = 10.9331 W
+
+          Data Cache:
+            Area = 0.52868 mm^2
+            Peak Dynamic = 0.0991646 W
+            Subthreshold Leakage = 0.0119043 W
+            Gate Leakage = 0.00145618 W
+            Runtime Dynamic = 0.1303 W
+
+          Load/Store Queue:
+            Area = 1.22144 mm^2
+            Peak Dynamic = 0.286361 W
+            Subthreshold Leakage = 0.0428969 W
+            Gate Leakage = 0.011721 W
+            Runtime Dynamic = 9.16355 W
+
+      Memory Management Unit:
+        Area = 1.1006 mm^2
+        Peak Dynamic = 0.399121 W
+        Subthreshold Leakage = 0.0527367 W
+        Gate Leakage = 0.0195353 W
+        Runtime Dynamic = 2.78316 W
+
+          Itlb:
+            Area = 0.293144 mm^2
+            Peak Dynamic = 0.0743045 W
+            Subthreshold Leakage = 0.00720086 W
+            Gate Leakage = 0.00218791 W
+            Runtime Dynamic = 0.594438 W
+
+          Dtlb:
+            Area = 0.590071 mm^2
+            Peak Dynamic = 0.0686851 W
+            Subthreshold Leakage = 0.0200602 W
+            Gate Leakage = 0.00578676 W
+            Runtime Dynamic = 0.549486 W
+
+      Execution Unit:
+        Area = 6.79584 mm^2
+        Peak Dynamic = 1.79843 W
+        Subthreshold Leakage = 0.610924 W
+        Gate Leakage = 0.116437 W
+        Runtime Dynamic = 9.38994 W
+
+          Register Files:
+            Area = 1.18037 mm^2
+            Peak Dynamic = 0.0639548 W
+            Subthreshold Leakage = 0.00981018 W
+            Gate Leakage = 0.00106415 W
+            Runtime Dynamic = 0.401933 W
+
+              Integer RF:
+                Area = 0.648931 mm^2
+                Peak Dynamic = 0.0485174 W
+                Subthreshold Leakage = 0.00196627 W
+                Gate Leakage = 0.000259389 W
+                Runtime Dynamic = 0.392074 W
+
+              Floating Point RF:
+                Area = 0.324465 mm^2
+                Peak Dynamic = 0.0154374 W
+                Subthreshold Leakage = 0.00196627 W
+                Gate Leakage = 0.000259389 W
+                Runtime Dynamic = 0.0098154 W
+
+              Register Windows:
+                Area = 0.206972 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00587765 W
+                Gate Leakage = 0.000545372 W
+                Runtime Dynamic = 4.40062e-05 W
+
+          Instruction Scheduler:
+            Area = 0.0458096 mm^2
+            Peak Dynamic = 0.0333897 W
+            Subthreshold Leakage = 0.000402487 W
+            Gate Leakage = 8.61395e-05 W
+            Runtime Dynamic = 0.287483 W
+
+              Instruction Window:
+                Area = 0.0458096 mm^2
+                Peak Dynamic = 0.0333897 W
+                Subthreshold Leakage = 0.000402487 W
+                Gate Leakage = 8.61395e-05 W
+                Runtime Dynamic = 0.287483 W
+
+          Integer ALUs (Count: 2 ):
+            Area = 0.448448 mm^2
+            Peak Dynamic = 0.425547 W
+            Subthreshold Leakage = 0.147955 W
+            Gate Leakage = 0.0266792 W
+            Runtime Dynamic = 3.78264 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 4.85979 mm^2
+            Peak Dynamic = 0.425547 W
+            Subthreshold Leakage = 0.400843 W
+            Gate Leakage = 0.07228 W
+            Runtime Dynamic = 0.0709246 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0440413 mm^2
+            Peak Dynamic = 0.481158 W
+            Subthreshold Leakage = 0.0264373 W
+            Gate Leakage = 0.00476717 W
+            Runtime Dynamic = 3.20772 W
+
+*****************************************************************************************
+L2
+      Area = 10.6299 mm^2
+      Peak Dynamic = 1.23435 W
+      Subthreshold Leakage = 0.338985 W
+      Gate Leakage = 0.0855405 W
+      Runtime Dynamic = 3.97632 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 1.45521 mm^2
+      Peak Dynamic = 0.665462 W
+      Subthreshold Leakage = 0.0312356 W
+      Gate Leakage = 0.0134358 W
+      Runtime Dynamic = 5.38275 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 8.06942 mm^2
+      Peak Dynamic = 1.48127 W
+      Subthreshold Leakage = 0.139768 W
+      Gate Leakage = 0.0260401 W
+      Runtime Dynamic = 7.93157 W
+
+      Front End Engine:
+        Area = 0.250458 mm^2
+        Peak Dynamic = 0.05883 W
+        Subthreshold Leakage = 0.0029079 W
+        Gate Leakage = 0.000455875 W
+        Runtime Dynamic = 0.298069 W
+
+      Transaction Engine:
+        Area = 2.66058 mm^2
+        Peak Dynamic = 0.6912 W
+        Subthreshold Leakage = 0.0465697 W
+        Gate Leakage = 0.00870562 W
+        Runtime Dynamic = 3.50205 W
+
+      PHY:
+        Area = 5.15838 mm^2
+        Peak Dynamic = 0.731237 W
+        Subthreshold Leakage = 0.0902901 W
+        Gate Leakage = 0.0168786 W
+        Runtime Dynamic = 4.13145 W
+
+*****************************************************************************************
+NIU:
+      Area = 7.93167 mm^2
+      Peak Dynamic = 0.93241 W
+      Subthreshold Leakage = 0.178813 W
+      Gate Leakage = 0.0918312 W
+      Runtime Dynamic = 0.652687 W
+
+*****************************************************************************************
+PCIe:
+      Area = 6.24 mm^2
+      Peak Dynamic = 1.5067 W
+      Subthreshold Leakage = 0.214091 W
+      Gate Leakage = 0.109948 W
+      Runtime Dynamic = 1.05469 W
+
+*****************************************************************************************
+NOC
+      Area = 9.56584 mm^2
+      Peak Dynamic = 1.07754 W
+      Subthreshold Leakage = 1.61961 W
+      Gate Leakage = 0.389994 W
+      Runtime Dynamic = 1.07754 W
+
+      Router: 
+        Area = 4.78292 mm^2
+        Peak Dynamic = 0.538772 W
+        Subthreshold Leakage = 0.809805 W
+        Gate Leakage = 0.194997 W
+        Runtime Dynamic = 1.07754 W
+
+            Virtual Channel Buffer:
+              Area = 0.827721 mm^2
+              Peak Dynamic = 0.0223838 W
+              Subthreshold Leakage = 0.00314985 W
+              Gate Leakage = 0.000413272 W
+              Runtime Dynamic = 0.0447677 W
+
+            Crossbar:
+              Area = 1.69589 mm^2
+              Peak Dynamic = 0.511174 W
+              Subthreshold Leakage = 0.806641 W
+              Gate Leakage = 0.194581 W
+              Runtime Dynamic = 1.02235 W
+
+            Arbiter:
+              Peak Dynamic = 0.00521447 W
+              Subthreshold Leakage = 1.42757e-05 W
+              Gate Leakage = 2.78294e-06 W
+              Runtime Dynamic = 0.0104289 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/Xeon_core b/ext/mcpat/results/Xeon_core
new file mode 100644 (file)
index 0000000..0cc9ae6
--- /dev/null
@@ -0,0 +1,341 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3400
+
+*****************************************************************************************
+Processor: 
+  Area = 417.445 mm^2
+  Peak Power = 142.148 W
+  Total Leakage = 55.8021 W
+  Peak Dynamic = 86.3458 W
+  Subthreshold Leakage = 52.785 W
+  Gate Leakage = 3.01712 W
+  Runtime Dynamic = 63.1851 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 133.278 mm^2
+    Peak Dynamic = 63.8414 W
+    Subthreshold Leakage = 32.4393 W
+    Gate Leakage = 2.72517 W
+    Runtime Dynamic = 41.616 W
+
+  Total L3s: 
+  Device Type= ITRS high performance device type
+    Area = 278.612 mm^2
+    Peak Dynamic = 6.11346 W
+    Subthreshold Leakage = 20.1995 W
+    Gate Leakage = 0.267752 W
+    Runtime Dynamic = 5.1782 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 5.5548 mm^2
+    Peak Dynamic = 16.3909 W
+    Subthreshold Leakage = 0.146229 W
+    Gate Leakage = 0.0241913 W
+    Runtime Dynamic = 16.3909 W
+
+*****************************************************************************************
+Core:
+      Area = 66.6389 mm^2
+      Peak Dynamic = 31.9207 W
+      Subthreshold Leakage = 16.2197 W
+      Gate Leakage = 1.36259 W
+      Runtime Dynamic = 41.616 W
+
+      Instruction Fetch Unit:
+        Area = 7.41271 mm^2
+        Peak Dynamic = 5.04492 W
+        Subthreshold Leakage = 1.26751 W
+        Gate Leakage = 0.09429 W
+        Runtime Dynamic = 5.39803 W
+
+          Instruction Cache:
+            Area = 2.44324 mm^2
+            Peak Dynamic = 1.42048 W
+            Subthreshold Leakage = 0.359444 W
+            Gate Leakage = 0.0187045 W
+            Runtime Dynamic = 2.13804 W
+
+          Branch Target Buffer:
+            Area = 0.729086 mm^2
+            Peak Dynamic = 0.161698 W
+            Subthreshold Leakage = 0.0616324 W
+            Gate Leakage = 0.00336254 W
+            Runtime Dynamic = 0.646794 W
+
+          Branch Predictor:
+            Area = 0.430961 mm^2
+            Peak Dynamic = 0.188469 W
+            Subthreshold Leakage = 0.0698834 W
+            Gate Leakage = 0.00415943 W
+            Runtime Dynamic = 0.166045 W
+
+              Global Predictor:
+                Area = 0.174771 mm^2
+                Peak Dynamic = 0.0633335 W
+                Subthreshold Leakage = 0.0274086 W
+                Gate Leakage = 0.00158249 W
+                Runtime Dynamic = 0.0633335 W
+
+              Local Predictor:
+                Area = 0.0735854 mm^2
+                Peak Dynamic = 0.0393754 W
+                Subthreshold Leakage = 0.0111166 W
+                Gate Leakage = 0.000721196 W
+                Runtime Dynamic = 0.0393754 W
+
+                Area = 0.0507308 mm^2
+                Peak Dynamic = 0.0258383 W
+                Subthreshold Leakage = 0.00749994 W
+                Gate Leakage = 0.000498805 W
+                Runtime Dynamic = 0.0258383 W
+
+              Chooser:
+                Area = 0.174771 mm^2
+                Peak Dynamic = 0.0633335 W
+                Subthreshold Leakage = 0.0274086 W
+                Gate Leakage = 0.00158249 W
+                Runtime Dynamic = 0.0633335 W
+
+              RAS:
+                Area = 0.0613744 mm^2
+                Peak Dynamic = 0.0224266 W
+                Subthreshold Leakage = 0.00394955 W
+                Gate Leakage = 0.000273252 W
+                Runtime Dynamic = 2.51602e-06 W
+
+          Instruction Buffer:
+            Area = 0.0684348 mm^2
+            Peak Dynamic = 0.704461 W
+            Subthreshold Leakage = 0.00411741 W
+            Gate Leakage = 0.000240288 W
+            Runtime Dynamic = 0.46964 W
+
+          Instruction Decoder:
+            Area = 3.73007 mm^2
+            Peak Dynamic = 1.97751 W
+            Subthreshold Leakage = 0.733056 W
+            Gate Leakage = 0.0575912 W
+            Runtime Dynamic = 1.97751 W
+
+      Renaming Unit:
+        Area = 1.82421 mm^2
+        Peak Dynamic = 2.76284 W
+        Subthreshold Leakage = 0.0765654 W
+        Gate Leakage = 0.0125478 W
+        Runtime Dynamic = 1.94438 W
+
+          Int Front End RAT:
+            Area = 0.875874 mm^2
+            Peak Dynamic = 1.249 W
+            Subthreshold Leakage = 0.0113878 W
+            Gate Leakage = 0.000693471 W
+            Runtime Dynamic = 1.249 W
+
+          FP Front End RAT:
+            Area = 0.405459 mm^2
+            Peak Dynamic = 0.610062 W
+            Subthreshold Leakage = 0.0144803 W
+            Gate Leakage = 0.000906674 W
+            Runtime Dynamic = 0.305031 W
+
+          Free List:
+            Area = 0.297629 mm^2
+            Peak Dynamic = 0.137664 W
+            Subthreshold Leakage = 0.0054316 W
+            Gate Leakage = 0.000326171 W
+            Runtime Dynamic = 0.275328 W
+
+          Int Retire RAT: 
+            Area = 0.0530903 mm^2
+            Peak Dynamic = 0.056222 W
+            Subthreshold Leakage = 0.00135314 W
+            Gate Leakage = 0.00011607 W
+            Runtime Dynamic = 0.056222 W
+
+          FP Retire RAT:
+            Area = 0.018828 mm^2
+            Peak Dynamic = 0.0186388 W
+            Subthreshold Leakage = 0.000788229 W
+            Gate Leakage = 6.41952e-05 W
+            Runtime Dynamic = 0.00931941 W
+
+          FP Free List:
+            Area = 0.162422 mm^2
+            Peak Dynamic = 0.0989385 W
+            Subthreshold Leakage = 0.00375181 W
+            Gate Leakage = 0.000209083 W
+            Runtime Dynamic = 0.0494693 W
+
+      Load Store Unit:
+        Area = 4.35998 mm^2
+        Peak Dynamic = 2.94939 W
+        Subthreshold Leakage = 0.208781 W
+        Gate Leakage = 0.0232213 W
+        Runtime Dynamic = 3.60184 W
+
+          Data Cache:
+            Area = 2.2051 mm^2
+            Peak Dynamic = 1.08067 W
+            Subthreshold Leakage = 0.0877157 W
+            Gate Leakage = 0.00573003 W
+            Runtime Dynamic = 2.30478 W
+
+          LoadQ:
+            Area = 0.637121 mm^2
+            Peak Dynamic = 0.551016 W
+            Subthreshold Leakage = 0.0283256 W
+            Gate Leakage = 0.00254841 W
+            Runtime Dynamic = 0.275508 W
+
+          StoreQ:
+            Area = 0.809965 mm^2
+            Peak Dynamic = 1.02155 W
+            Subthreshold Leakage = 0.053367 W
+            Gate Leakage = 0.00471074 W
+            Runtime Dynamic = 1.02155 W
+
+      Memory Management Unit:
+        Area = 0.517456 mm^2
+        Peak Dynamic = 0.979218 W
+        Subthreshold Leakage = 0.0808171 W
+        Gate Leakage = 0.0139952 W
+        Runtime Dynamic = 1.66678 W
+
+          Itlb:
+            Area = 0.127123 mm^2
+            Peak Dynamic = 0.236587 W
+            Subthreshold Leakage = 0.0160962 W
+            Gate Leakage = 0.00146431 W
+            Runtime Dynamic = 0.473177 W
+
+          Dtlb:
+            Area = 0.379422 mm^2
+            Peak Dynamic = 0.298399 W
+            Subthreshold Leakage = 0.0253484 W
+            Gate Leakage = 0.00229878 W
+            Runtime Dynamic = 1.1936 W
+
+      Execution Unit:
+        Area = 27.5381 mm^2
+        Peak Dynamic = 16.9637 W
+        Subthreshold Leakage = 7.08185 W
+        Gate Leakage = 0.73316 W
+        Runtime Dynamic = 22.7198 W
+
+          Register Files:
+            Area = 11.2548 mm^2
+            Peak Dynamic = 3.2925 W
+            Subthreshold Leakage = 0.11111 W
+            Gate Leakage = 0.00754256 W
+            Runtime Dynamic = 1.69823 W
+
+              Integer RF:
+                Area = 7.55916 mm^2
+                Peak Dynamic = 2.82012 W
+                Subthreshold Leakage = 0.0664048 W
+                Gate Leakage = 0.00458288 W
+                Runtime Dynamic = 1.51078 W
+
+              Floating Point RF:
+                Area = 3.69565 mm^2
+                Peak Dynamic = 0.472385 W
+                Subthreshold Leakage = 0.0447053 W
+                Gate Leakage = 0.00295968 W
+                Runtime Dynamic = 0.187454 W
+
+          Instruction Scheduler:
+            Area = 2.08681 mm^2
+            Peak Dynamic = 2.1684 W
+            Subthreshold Leakage = 0.0325294 W
+            Gate Leakage = 0.00296372 W
+            Runtime Dynamic = 2.59089 W
+
+              Instruction Window:
+                Area = 0.287309 mm^2
+                Peak Dynamic = 0.929972 W
+                Subthreshold Leakage = 0.0127376 W
+                Gate Leakage = 0.00137073 W
+                Runtime Dynamic = 1.2089 W
+
+              FP Instruction Window:
+                Area = 0.128977 mm^2
+                Peak Dynamic = 0.478661 W
+                Subthreshold Leakage = 0.00802287 W
+                Gate Leakage = 0.000873414 W
+                Runtime Dynamic = 0.622222 W
+
+              ROB:
+                Area = 1.67052 mm^2
+                Peak Dynamic = 0.759764 W
+                Subthreshold Leakage = 0.0117689 W
+                Gate Leakage = 0.000719579 W
+                Runtime Dynamic = 0.759764 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 4.03603 mm^2
+            Peak Dynamic = 4.55818 W
+            Subthreshold Leakage = 3.9898 W
+            Gate Leakage = 0.412015 W
+            Runtime Dynamic = 2.33394 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 9.71959 mm^2
+            Peak Dynamic = 1.43327 W
+            Subthreshold Leakage = 2.40207 W
+            Gate Leakage = 0.248054 W
+            Runtime Dynamic = 2.55333 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.336336 mm^2
+            Peak Dynamic = 0.510666 W
+            Subthreshold Leakage = 0.332484 W
+            Gate Leakage = 0.0343346 W
+            Runtime Dynamic = 3.18505 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0936618 mm^2
+            Peak Dynamic = 4.4084 W
+            Subthreshold Leakage = 0.174486 W
+            Gate Leakage = 0.0180186 W
+            Runtime Dynamic = 10.3584 W
+
+    L2
+    Area = 15.914 mm^2
+    Peak Dynamic = 3.22061 W
+    Subthreshold Leakage = 3.01991 W
+    Gate Leakage = 0.0223008 W
+    Runtime Dynamic = 6.28514 W
+
+*****************************************************************************************
+      L3
+      Area = 278.612 mm^2
+      Peak Dynamic = 6.11346 W
+      Subthreshold Leakage = 20.1995 W
+      Gate Leakage = 0.267752 W
+      Runtime Dynamic = 5.1782 W
+
+*****************************************************************************************
+BUSES
+      Area = 5.5548 mm^2
+      Peak Dynamic = 16.3909 W
+      Subthreshold Leakage = 0.146229 W
+      Gate Leakage = 0.0241913 W
+      Runtime Dynamic = 16.3909 W
+
+      Bus: 
+        Area = 5.5548 mm^2
+        Peak Dynamic = 16.3909 W
+        Subthreshold Leakage = 0.146229 W
+        Gate Leakage = 0.0241913 W
+        Runtime Dynamic = 16.3909 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/results/Xeon_uncore b/ext/mcpat/results/Xeon_uncore
new file mode 100644 (file)
index 0000000..558331c
--- /dev/null
@@ -0,0 +1,341 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3400
+
+*****************************************************************************************
+Processor: 
+  Area = 418.629 mm^2
+  Peak Power = 96.2032 W
+  Total Leakage = 27.5568 W
+  Peak Dynamic = 68.6463 W
+  Subthreshold Leakage = 25.8287 W
+  Gate Leakage = 1.72809 W
+  Runtime Dynamic = 50.332 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 134.217 mm^2
+    Peak Dynamic = 50.8677 W
+    Subthreshold Leakage = 15.0187 W
+    Gate Leakage = 1.57092 W
+    Runtime Dynamic = 33.3003 W
+
+  Total L3s: 
+  Device Type= ITRS high performance device type
+    Area = 278.843 mm^2
+    Peak Dynamic = 4.84476 W
+    Subthreshold Leakage = 10.7416 W
+    Gate Leakage = 0.144361 W
+    Runtime Dynamic = 4.09781 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 5.56828 mm^2
+    Peak Dynamic = 12.9339 W
+    Subthreshold Leakage = 0.0684953 W
+    Gate Leakage = 0.0128043 W
+    Runtime Dynamic = 12.9339 W
+
+*****************************************************************************************
+Core:
+      Area = 67.1085 mm^2
+      Peak Dynamic = 25.4338 W
+      Subthreshold Leakage = 7.50933 W
+      Gate Leakage = 0.78546 W
+      Runtime Dynamic = 33.3003 W
+
+      Instruction Fetch Unit:
+        Area = 7.56843 mm^2
+        Peak Dynamic = 4.27305 W
+        Subthreshold Leakage = 0.571346 W
+        Gate Leakage = 0.0523885 W
+        Runtime Dynamic = 4.67953 W
+
+          Instruction Cache:
+            Area = 2.44678 mm^2
+            Peak Dynamic = 1.1785 W
+            Subthreshold Leakage = 0.151766 W
+            Gate Leakage = 0.009764 W
+            Runtime Dynamic = 1.7926 W
+
+          Branch Target Buffer:
+            Area = 0.718635 mm^2
+            Peak Dynamic = 0.151619 W
+            Subthreshold Leakage = 0.0238082 W
+            Gate Leakage = 0.0015503 W
+            Runtime Dynamic = 0.606475 W
+
+          Branch Predictor:
+            Area = 0.446844 mm^2
+            Peak Dynamic = 0.158508 W
+            Subthreshold Leakage = 0.0293041 W
+            Gate Leakage = 0.0021362 W
+            Runtime Dynamic = 0.14087 W
+
+              Global Predictor:
+                Area = 0.174801 mm^2
+                Peak Dynamic = 0.0543932 W
+                Subthreshold Leakage = 0.0116121 W
+                Gate Leakage = 0.000827171 W
+                Runtime Dynamic = 0.0543932 W
+
+              Local Predictor:
+                Area = 0.0788692 mm^2
+                Peak Dynamic = 0.0320817 W
+                Subthreshold Leakage = 0.00452837 W
+                Gate Leakage = 0.000354718 W
+                Runtime Dynamic = 0.0320817 W
+
+                Area = 0.050748 mm^2
+                Peak Dynamic = 0.0218669 W
+                Subthreshold Leakage = 0.00318852 W
+                Gate Leakage = 0.000264126 W
+                Runtime Dynamic = 0.0218669 W
+
+              Chooser:
+                Area = 0.174801 mm^2
+                Peak Dynamic = 0.0543932 W
+                Subthreshold Leakage = 0.0116121 W
+                Gate Leakage = 0.000827171 W
+                Runtime Dynamic = 0.0543932 W
+
+              RAS:
+                Area = 0.0929863 mm^2
+                Peak Dynamic = 0.0176394 W
+                Subthreshold Leakage = 0.00155163 W
+                Gate Leakage = 0.00012714 W
+                Runtime Dynamic = 1.96119e-06 W
+
+          Instruction Buffer:
+            Area = 0.0687233 mm^2
+            Peak Dynamic = 0.579633 W
+            Subthreshold Leakage = 0.00177049 W
+            Gate Leakage = 0.000129185 W
+            Runtime Dynamic = 0.386422 W
+
+          Instruction Decoder:
+            Area = 3.87654 mm^2
+            Peak Dynamic = 1.75316 W
+            Subthreshold Leakage = 0.348225 W
+            Gate Leakage = 0.0335628 W
+            Runtime Dynamic = 1.75316 W
+
+      Renaming Unit:
+        Area = 1.83366 mm^2
+        Peak Dynamic = 2.16025 W
+        Subthreshold Leakage = 0.0324638 W
+        Gate Leakage = 0.00648876 W
+        Runtime Dynamic = 1.53428 W
+
+          Int Front End RAT:
+            Area = 0.879521 mm^2
+            Peak Dynamic = 0.975897 W
+            Subthreshold Leakage = 0.00490782 W
+            Gate Leakage = 0.000372282 W
+            Runtime Dynamic = 0.975897 W
+
+          FP Front End RAT:
+            Area = 0.407642 mm^2
+            Peak Dynamic = 0.477469 W
+            Subthreshold Leakage = 0.00619591 W
+            Gate Leakage = 0.000483134 W
+            Runtime Dynamic = 0.238735 W
+
+          Free List:
+            Area = 0.300513 mm^2
+            Peak Dynamic = 0.112906 W
+            Subthreshold Leakage = 0.00233243 W
+            Gate Leakage = 0.000174984 W
+            Runtime Dynamic = 0.225813 W
+
+          Int Retire RAT: 
+            Area = 0.0534147 mm^2
+            Peak Dynamic = 0.0453154 W
+            Subthreshold Leakage = 0.00058142 W
+            Gate Leakage = 6.26682e-05 W
+            Runtime Dynamic = 0.0453154 W
+
+          FP Retire RAT:
+            Area = 0.018897 mm^2
+            Peak Dynamic = 0.0151716 W
+            Subthreshold Leakage = 0.000337803 W
+            Gate Leakage = 3.45545e-05 W
+            Runtime Dynamic = 0.00758578 W
+
+          FP Free List:
+            Area = 0.162758 mm^2
+            Peak Dynamic = 0.081858 W
+            Subthreshold Leakage = 0.00163685 W
+            Gate Leakage = 0.000115075 W
+            Runtime Dynamic = 0.040929 W
+
+      Load Store Unit:
+        Area = 4.4281 mm^2
+        Peak Dynamic = 2.34722 W
+        Subthreshold Leakage = 0.0896936 W
+        Gate Leakage = 0.0121845 W
+        Runtime Dynamic = 2.89901 W
+
+          Data Cache:
+            Area = 2.25853 mm^2
+            Peak Dynamic = 0.888323 W
+            Subthreshold Leakage = 0.0382167 W
+            Gate Leakage = 0.00311455 W
+            Runtime Dynamic = 1.88387 W
+
+          LoadQ:
+            Area = 0.638298 mm^2
+            Peak Dynamic = 0.435889 W
+            Subthreshold Leakage = 0.0121526 W
+            Gate Leakage = 0.00134375 W
+            Runtime Dynamic = 0.217944 W
+
+          StoreQ:
+            Area = 0.811765 mm^2
+            Peak Dynamic = 0.79719 W
+            Subthreshold Leakage = 0.0228527 W
+            Gate Leakage = 0.00248017 W
+            Runtime Dynamic = 0.79719 W
+
+      Memory Management Unit:
+        Area = 0.518866 mm^2
+        Peak Dynamic = 0.760463 W
+        Subthreshold Leakage = 0.0342246 W
+        Gate Leakage = 0.00722713 W
+        Runtime Dynamic = 1.31193 W
+
+          Itlb:
+            Area = 0.12744 mm^2
+            Peak Dynamic = 0.187517 W
+            Subthreshold Leakage = 0.00686539 W
+            Gate Leakage = 0.000767441 W
+            Runtime Dynamic = 0.375037 W
+
+          Dtlb:
+            Area = 0.380515 mm^2
+            Peak Dynamic = 0.234221 W
+            Subthreshold Leakage = 0.0108877 W
+            Gate Leakage = 0.00121362 W
+            Runtime Dynamic = 0.936886 W
+
+      Execution Unit:
+        Area = 27.5564 mm^2
+        Peak Dynamic = 13.34 W
+        Subthreshold Leakage = 3.35055 W
+        Gate Leakage = 0.425 W
+        Runtime Dynamic = 17.8618 W
+
+          Register Files:
+            Area = 11.2668 mm^2
+            Peak Dynamic = 2.65925 W
+            Subthreshold Leakage = 0.0472795 W
+            Gate Leakage = 0.00398463 W
+            Runtime Dynamic = 1.37147 W
+
+              Integer RF:
+                Area = 7.56635 mm^2
+                Peak Dynamic = 2.27672 W
+                Subthreshold Leakage = 0.0282472 W
+                Gate Leakage = 0.00241709 W
+                Runtime Dynamic = 1.21967 W
+
+              Floating Point RF:
+                Area = 3.70048 mm^2
+                Peak Dynamic = 0.382527 W
+                Subthreshold Leakage = 0.0190323 W
+                Gate Leakage = 0.00156754 W
+                Runtime Dynamic = 0.151797 W
+
+          Instruction Scheduler:
+            Area = 2.09118 mm^2
+            Peak Dynamic = 1.7092 W
+            Subthreshold Leakage = 0.0139125 W
+            Gate Leakage = 0.00156067 W
+            Runtime Dynamic = 2.04197 W
+
+              Instruction Window:
+                Area = 0.287606 mm^2
+                Peak Dynamic = 0.721714 W
+                Subthreshold Leakage = 0.00547415 W
+                Gate Leakage = 0.000721338 W
+                Runtime Dynamic = 0.940723 W
+
+              FP Instruction Window:
+                Area = 0.129287 mm^2
+                Peak Dynamic = 0.372875 W
+                Subthreshold Leakage = 0.0034355 W
+                Gate Leakage = 0.00045775 W
+                Runtime Dynamic = 0.486639 W
+
+              ROB:
+                Area = 1.67428 mm^2
+                Peak Dynamic = 0.61461 W
+                Subthreshold Leakage = 0.00500288 W
+                Gate Leakage = 0.00038158 W
+                Runtime Dynamic = 0.61461 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 4.03603 mm^2
+            Peak Dynamic = 3.52986 W
+            Subthreshold Leakage = 1.89726 W
+            Gate Leakage = 0.240113 W
+            Runtime Dynamic = 1.8074 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 9.71959 mm^2
+            Peak Dynamic = 1.10993 W
+            Subthreshold Leakage = 1.14225 W
+            Gate Leakage = 0.14456 W
+            Runtime Dynamic = 1.9773 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.336336 mm^2
+            Peak Dynamic = 0.405148 W
+            Subthreshold Leakage = 0.158105 W
+            Gate Leakage = 0.0200094 W
+            Runtime Dynamic = 2.4988 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0954831 mm^2
+            Peak Dynamic = 3.47499 W
+            Subthreshold Leakage = 0.0752739 W
+            Gate Leakage = 0.00952648 W
+            Runtime Dynamic = 8.1649 W
+
+    L2
+    Area = 16.1307 mm^2
+    Peak Dynamic = 2.55285 W
+    Subthreshold Leakage = 1.29868 W
+    Gate Leakage = 0.012304 W
+    Runtime Dynamic = 5.01368 W
+
+*****************************************************************************************
+      L3
+      Area = 278.843 mm^2
+      Peak Dynamic = 4.84476 W
+      Subthreshold Leakage = 10.7416 W
+      Gate Leakage = 0.144361 W
+      Runtime Dynamic = 4.09781 W
+
+*****************************************************************************************
+BUSES
+      Area = 5.56828 mm^2
+      Peak Dynamic = 12.9339 W
+      Subthreshold Leakage = 0.0684953 W
+      Gate Leakage = 0.0128043 W
+      Runtime Dynamic = 12.9339 W
+
+      Bus: 
+        Area = 5.56828 mm^2
+        Peak Dynamic = 12.9339 W
+        Subthreshold Leakage = 0.0684953 W
+        Gate Leakage = 0.0128043 W
+        Runtime Dynamic = 12.9339 W
+
+*****************************************************************************************
diff --git a/ext/mcpat/sharedcache.cc b/ext/mcpat/sharedcache.cc
new file mode 100644 (file)
index 0000000..3a61e1b
--- /dev/null
@@ -0,0 +1,1162 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+
+#include "XML_Parse.h"
+#include "arbiter.h"
+#include "array.h"
+#include "basic_circuit.h"
+#include "const.h"
+#include "io.h"
+#include "logic.h"
+#include "parameter.h"
+#include "sharedcache.h"
+
+SharedCache::SharedCache(ParseXML* XML_interface, int ithCache_, InputParameter* interface_ip_, enum cache_level cacheL_)
+:XML(XML_interface),
+ ithCache(ithCache_),
+ interface_ip(*interface_ip_),
+ cacheL(cacheL_),
+ dir_overhead(0)
+{
+  int idx;
+  int tag, data;
+  bool is_default, debug;
+  enum Device_ty device_t;
+  enum Core_type  core_t;
+  double size, line, assoc, banks;
+  if (cacheL==L2 && XML->sys.Private_L2)
+  {
+          device_t=Core_device;
+      core_t = (enum Core_type)XML->sys.core[ithCache].machine_type;
+  }
+  else
+  {
+          device_t=LLC_device;
+          core_t = Inorder;
+  }
+
+  debug           = false;
+  is_default=true;//indication for default setup
+  if (XML->sys.Embedded)
+                {
+                interface_ip.wt                  =Global_30;
+                interface_ip.wire_is_mat_type = 0;
+                interface_ip.wire_os_mat_type = 1;
+                }
+        else
+                {
+                interface_ip.wt                  =Global;
+                interface_ip.wire_is_mat_type = 2;
+                interface_ip.wire_os_mat_type = 2;
+                }
+  set_cache_param();
+
+  //All lower level cache are physically indexed and tagged.
+  size                             = cachep.capacity;
+  line                             = cachep.blockW;
+  assoc                            = cachep.assoc;
+  banks                            = cachep.nbanks;
+  if ((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))
+  {
+          assoc = 0;
+          tag   = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          interface_ip.num_search_ports    = 1;
+  }
+  else
+  {
+          idx                                                     = debug?9:int(ceil(log2(size/line/assoc)));
+          tag                                                     = debug?51:XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS;
+          interface_ip.num_search_ports    = 0;
+          if (cachep.dir_ty==SBT)
+          {
+                  dir_overhead = ceil(XML->sys.number_of_cores/8.0)*8/(cachep.blockW*8);
+                  line = cachep.blockW*(1+ dir_overhead) ;
+                  size = cachep.capacity*(1+ dir_overhead);
+
+          }
+  }
+//  if (XML->sys.first_level_dir==2)
+//       tag += int(XML->sys.domain_size + 5);
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.cache_sz            = (int)size;
+  interface_ip.line_sz             = (int)line;
+  interface_ip.assoc               = (int)assoc;
+  interface_ip.nbanks              = (int)banks;
+  interface_ip.out_w               = interface_ip.line_sz*8/2;
+  interface_ip.access_mode         = 1;
+  interface_ip.throughput          = cachep.throughput;
+  interface_ip.latency             = cachep.latency;
+  interface_ip.is_cache                         = true;
+  interface_ip.pure_ram                         = false;
+  interface_ip.pure_cam          = false;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 1;//lower level cache usually has one port.
+  interface_ip.num_rd_ports        = 0;
+  interface_ip.num_wr_ports        = 0;
+  interface_ip.num_se_rd_ports     = 0;
+//  interface_ip.force_cache_config  =true;
+//  interface_ip.ndwl = 4;
+//  interface_ip.ndbl = 8;
+//  interface_ip.nspd = 1;
+//  interface_ip.ndcm =1 ;
+//  interface_ip.ndsam1 =1;
+//  interface_ip.ndsam2 =1;
+  unicache.caches = new ArrayST(&interface_ip, cachep.name + "cache", device_t, true, core_t);
+  unicache.area.set_area(unicache.area.get_area()+ unicache.caches->local_result.area);
+  area.set_area(area.get_area()+ unicache.caches->local_result.area);
+  interface_ip.force_cache_config  =false;
+
+  if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+  {
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + unicache.caches->l_ip.line_sz;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+          interface_ip.cache_sz            = cachep.missb_size*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.is_cache                           = true;
+          interface_ip.pure_ram                           = false;
+          interface_ip.pure_cam            = false;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8/2;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = cachep.throughput;//means cycle time
+          interface_ip.latency             = cachep.latency;//means access time
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          interface_ip.num_search_ports    = 1;
+          unicache.missb = new ArrayST(&interface_ip, cachep.name + "MissB", device_t, true, core_t);
+          unicache.area.set_area(unicache.area.get_area()+ unicache.missb->local_result.area);
+          area.set_area(area.get_area()+ unicache.missb->local_result.area);
+          //fill buffer
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = unicache.caches->l_ip.line_sz;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = data*cachep.fu_size ;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8/2;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          =  cachep.throughput;
+          interface_ip.latency             =  cachep.latency;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          unicache.ifb = new ArrayST(&interface_ip, cachep.name + "FillB", device_t, true, core_t);
+          unicache.area.set_area(unicache.area.get_area()+ unicache.ifb->local_result.area);
+          area.set_area(area.get_area()+ unicache.ifb->local_result.area);
+          //prefetch buffer
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge.
+          data                                                    = unicache.caches->l_ip.line_sz;//separate queue to prevent from cache polution.
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+          interface_ip.cache_sz            = cachep.prefetchb_size*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8/2;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = cachep.throughput;
+          interface_ip.latency             = cachep.latency;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          unicache.prefetchb = new ArrayST(&interface_ip, cachep.name + "PrefetchB", device_t, true, core_t);
+          unicache.area.set_area(unicache.area.get_area()+ unicache.prefetchb->local_result.area);
+          area.set_area(area.get_area()+ unicache.prefetchb->local_result.area);
+          //WBB
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = unicache.caches->l_ip.line_sz;
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;
+          interface_ip.cache_sz            = cachep.wbb_size*interface_ip.line_sz;
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1;
+          interface_ip.out_w               = interface_ip.line_sz*8/2;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = cachep.throughput;
+          interface_ip.latency             = cachep.latency;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          unicache.wbb = new ArrayST(&interface_ip, cachep.name + "WBB", device_t, true, core_t);
+          unicache.area.set_area(unicache.area.get_area()+ unicache.wbb->local_result.area);
+          area.set_area(area.get_area()+ unicache.wbb->local_result.area);
+  }
+  //  //pipeline
+//  interface_ip.pipeline_stages = int(ceil(llCache.caches.local_result.access_time/llCache.caches.local_result.cycle_time));
+//  interface_ip.per_stage_vector = llCache.caches.l_ip.out_w + llCache.caches.l_ip.tag_w ;
+//  pipeLogicCache.init_pipeline(is_default, &interface_ip);
+//  pipeLogicCache.compute_pipeline();
+
+  /*
+  if (!((XML->sys.number_of_dir_levels==1 && XML->sys.first_level_dir ==1)
+                  ||(XML->sys.number_of_dir_levels==1 && XML->sys.first_level_dir ==2)))//not single level IC and DIC
+  {
+  //directory Now assuming one directory per bank, TODO:should change it later
+  size                             = XML->sys.L2directory.L2Dir_config[0];
+  line                             = XML->sys.L2directory.L2Dir_config[1];
+  assoc                            = XML->sys.L2directory.L2Dir_config[2];
+  banks                            = XML->sys.L2directory.L2Dir_config[3];
+  tag                                                     = debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little bit over estimate
+  interface_ip.specific_tag        = 0;
+  interface_ip.tag_w               = tag;
+  interface_ip.cache_sz            = XML->sys.L2directory.L2Dir_config[0];
+  interface_ip.line_sz             = XML->sys.L2directory.L2Dir_config[1];
+  interface_ip.assoc               = XML->sys.L2directory.L2Dir_config[2];
+  interface_ip.nbanks              = XML->sys.L2directory.L2Dir_config[3];
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+  interface_ip.throughput          = XML->sys.L2directory.L2Dir_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2directory.L2Dir_config[5]/clockRate;
+  interface_ip.is_cache                         = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;//lower level cache usually has one port.
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+
+  strcpy(directory.caches.name,"L2 Directory");
+  directory.caches.init_cache(&interface_ip);
+  directory.caches.optimize_array();
+  directory.area += directory.caches.local_result.area;
+  //output_data_csv(directory.caches.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //miss buffer Each MSHR contains enough state to handle one or more accesses of any type to a single memory line.
+  //Due to the generality of the MSHR mechanism, the amount of state involved is non-trivial,
+  //including the address, pointers to the cache entry and destination register, written data, and various other pieces of state.
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + directory.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.missb.name,"directoryMissB");
+  directory.missb.init_cache(&interface_ip);
+  directory.missb.optimize_array();
+  directory.area += directory.missb.local_result.area;
+  //output_data_csv(directory.missb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //fill buffer
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = directory.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = data*XML->sys.L2[ithCache].buffer_sizes[1];
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          =  XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             =  XML->sys.L2[ithCache].L2_config[5]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.ifb.name,"directoryFillB");
+  directory.ifb.init_cache(&interface_ip);
+  directory.ifb.optimize_array();
+  directory.area += directory.ifb.local_result.area;
+  //output_data_csv(directory.ifb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //prefetch buffer
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge.
+  data                                                    = directory.caches.l_ip.line_sz;//separate queue to prevent from cache polution.
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.prefetchb.name,"directoryPrefetchB");
+  directory.prefetchb.init_cache(&interface_ip);
+  directory.prefetchb.optimize_array();
+  directory.area += directory.prefetchb.local_result.area;
+  //output_data_csv(directory.prefetchb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //WBB
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = directory.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.wbb.name,"directoryWBB");
+  directory.wbb.init_cache(&interface_ip);
+  directory.wbb.optimize_array();
+  directory.area += directory.wbb.local_result.area;
+  }
+
+  if (XML->sys.number_of_dir_levels ==2 && XML->sys.first_level_dir==0)
+  {
+  //first level directory
+  size                             = XML->sys.L2directory.L2Dir_config[0]*XML->sys.domain_size/128;
+  line                             = int(ceil(XML->sys.domain_size/8.0));
+  assoc                            = XML->sys.L2directory.L2Dir_config[2];
+  banks                            = XML->sys.L2directory.L2Dir_config[3];
+  tag                                                     = debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little bit over estimate
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.cache_sz            = XML->sys.L2directory.L2Dir_config[0];
+  interface_ip.line_sz             = XML->sys.L2directory.L2Dir_config[1];
+  interface_ip.assoc               = XML->sys.L2directory.L2Dir_config[2];
+  interface_ip.nbanks              = XML->sys.L2directory.L2Dir_config[3];
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+  interface_ip.throughput          = XML->sys.L2directory.L2Dir_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2directory.L2Dir_config[5]/clockRate;
+  interface_ip.is_cache                         = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;//lower level cache usually has one port.
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+
+  strcpy(directory1.caches.name,"first level Directory");
+  directory1.caches.init_cache(&interface_ip);
+  directory1.caches.optimize_array();
+  directory1.area += directory1.caches.local_result.area;
+  //output_data_csv(directory.caches.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //miss buffer Each MSHR contains enough state to handle one or more accesses of any type to a single memory line.
+  //Due to the generality of the MSHR mechanism, the amount of state involved is non-trivial,
+  //including the address, pointers to the cache entry and destination register, written data, and various other pieces of state.
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + directory1.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.missb.name,"directory1MissB");
+  directory1.missb.init_cache(&interface_ip);
+  directory1.missb.optimize_array();
+  directory1.area += directory1.missb.local_result.area;
+  //output_data_csv(directory.missb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //fill buffer
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = directory1.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = data*XML->sys.L2[ithCache].buffer_sizes[1];
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          =  XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             =  XML->sys.L2[ithCache].L2_config[5]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.ifb.name,"directory1FillB");
+  directory1.ifb.init_cache(&interface_ip);
+  directory1.ifb.optimize_array();
+  directory1.area += directory1.ifb.local_result.area;
+  //output_data_csv(directory.ifb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //prefetch buffer
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge.
+  data                                                    = directory1.caches.l_ip.line_sz;//separate queue to prevent from cache polution.
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.prefetchb.name,"directory1PrefetchB");
+  directory1.prefetchb.init_cache(&interface_ip);
+  directory1.prefetchb.optimize_array();
+  directory1.area += directory1.prefetchb.local_result.area;
+  //output_data_csv(directory.prefetchb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //WBB
+  tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data                                                    = directory1.caches.l_ip.line_sz;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;
+  interface_ip.cache_sz            = XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+  interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.wbb.name,"directoryWBB");
+  directory1.wbb.init_cache(&interface_ip);
+  directory1.wbb.optimize_array();
+  directory1.area += directory1.wbb.local_result.area;
+  }
+
+  if (XML->sys.first_level_dir==1)//IC
+  {
+          tag                                                     = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+          data                                                    = int(ceil(XML->sys.domain_size/8.0));
+          interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;
+          interface_ip.cache_sz            = XML->sys.domain_size*data*XML->sys.L2[ithCache].L2_config[0]/XML->sys.L2[ithCache].L2_config[1];
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1024;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          = XML->sys.L2[ithCache].L2_config[4]/clockRate;
+          interface_ip.latency             = XML->sys.L2[ithCache].L2_config[5]/clockRate;
+          interface_ip.obj_func_dyn_energy = 0;
+          interface_ip.obj_func_dyn_power  = 0;
+          interface_ip.obj_func_leak_power = 0;
+          interface_ip.obj_func_cycle_t    = 1;
+          interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          strcpy(inv_dir.caches.name,"inv_dir");
+          inv_dir.caches.init_cache(&interface_ip);
+          inv_dir.caches.optimize_array();
+          inv_dir.area = inv_dir.caches.local_result.area;
+
+  }
+*/
+//  //pipeline
+//  interface_ip.pipeline_stages = int(ceil(directory.caches.local_result.access_time/directory.caches.local_result.cycle_time));
+//  interface_ip.per_stage_vector = directory.caches.l_ip.out_w + directory.caches.l_ip.tag_w ;
+//  pipeLogicDirectory.init_pipeline(is_default, &interface_ip);
+//  pipeLogicDirectory.compute_pipeline();
+//
+//  //clock power
+//  clockNetwork.init_wire_external(is_default, &interface_ip);
+//  clockNetwork.clk_area           =area*1.1;//10% of placement overhead. rule of thumb
+//  clockNetwork.end_wiring_level   =5;//toplevel metal
+//  clockNetwork.start_wiring_level =5;//toplevel metal
+//  clockNetwork.num_regs           = pipeLogicCache.tot_stage_vector + pipeLogicDirectory.tot_stage_vector;
+//  clockNetwork.optimize_wire();
+
+}
+
+
+void SharedCache::computeEnergy(bool is_tdp)
+{
+        double homenode_data_access = (cachep.dir_ty==SBT)? 0.9:1.0;
+        if (is_tdp)
+        {
+                if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+                {
+                        //init stats for Peak
+                        unicache.caches->stats_t.readAc.access  = .67*unicache.caches->l_ip.num_rw_ports*cachep.duty_cycle*homenode_data_access;
+                        unicache.caches->stats_t.readAc.miss    = 0;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = .33*unicache.caches->l_ip.num_rw_ports*cachep.duty_cycle*homenode_data_access;
+                        unicache.caches->stats_t.writeAc.miss   = 0;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->tdp_stats = unicache.caches->stats_t;
+
+                        if (cachep.dir_ty==SBT)
+                        {
+                                homenode_stats_t.readAc.access  = .67*unicache.caches->l_ip.num_rw_ports*cachep.dir_duty_cycle*(1-homenode_data_access);
+                                homenode_stats_t.readAc.miss    = 0;
+                                homenode_stats_t.readAc.hit     = homenode_stats_t.readAc.access - homenode_stats_t.readAc.miss;
+                                homenode_stats_t.writeAc.access  = .67*unicache.caches->l_ip.num_rw_ports*cachep.dir_duty_cycle*(1-homenode_data_access);
+                                homenode_stats_t.writeAc.miss   = 0;
+                                homenode_stats_t.writeAc.hit    = homenode_stats_t.writeAc.access -    homenode_stats_t.writeAc.miss;
+                                homenode_tdp_stats = homenode_stats_t;
+                        }
+
+                        unicache.missb->stats_t.readAc.access  = unicache.missb->l_ip.num_search_ports;
+                        unicache.missb->stats_t.writeAc.access = unicache.missb->l_ip.num_search_ports;
+                        unicache.missb->tdp_stats = unicache.missb->stats_t;
+
+                        unicache.ifb->stats_t.readAc.access  = unicache.ifb->l_ip.num_search_ports;
+                        unicache.ifb->stats_t.writeAc.access = unicache.ifb->l_ip.num_search_ports;
+                        unicache.ifb->tdp_stats = unicache.ifb->stats_t;
+
+                        unicache.prefetchb->stats_t.readAc.access  = unicache.prefetchb->l_ip.num_search_ports;
+                        unicache.prefetchb->stats_t.writeAc.access = unicache.ifb->l_ip.num_search_ports;
+                        unicache.prefetchb->tdp_stats = unicache.prefetchb->stats_t;
+
+                        unicache.wbb->stats_t.readAc.access  = unicache.wbb->l_ip.num_search_ports;
+                        unicache.wbb->stats_t.writeAc.access = unicache.wbb->l_ip.num_search_ports;
+                        unicache.wbb->tdp_stats = unicache.wbb->stats_t;
+                }
+                else
+                {
+                        unicache.caches->stats_t.readAc.access  = unicache.caches->l_ip.num_search_ports*cachep.duty_cycle;
+                        unicache.caches->stats_t.readAc.miss    = 0;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = 0;
+                        unicache.caches->stats_t.writeAc.miss   = 0;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->tdp_stats = unicache.caches->stats_t;
+
+                }
+
+        }
+        else
+        {
+                //init stats for runtime power (RTP)
+                if (cacheL==L2)
+                {
+                        unicache.caches->stats_t.readAc.access  = XML->sys.L2[ithCache].read_accesses;
+                        unicache.caches->stats_t.readAc.miss    = XML->sys.L2[ithCache].read_misses;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = XML->sys.L2[ithCache].write_accesses;
+                        unicache.caches->stats_t.writeAc.miss   = XML->sys.L2[ithCache].write_misses;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->rtp_stats = unicache.caches->stats_t;
+
+                        if (cachep.dir_ty==SBT)
+                        {
+                                homenode_rtp_stats.readAc.access  = XML->sys.L2[ithCache].homenode_read_accesses;
+                                homenode_rtp_stats.readAc.miss    = XML->sys.L2[ithCache].homenode_read_misses;
+                                homenode_rtp_stats.readAc.hit     = homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss;
+                                homenode_rtp_stats.writeAc.access = XML->sys.L2[ithCache].homenode_write_accesses;
+                                homenode_rtp_stats.writeAc.miss   = XML->sys.L2[ithCache].homenode_write_misses;
+                                homenode_rtp_stats.writeAc.hit    = homenode_rtp_stats.writeAc.access -        homenode_rtp_stats.writeAc.miss;
+                        }
+                }
+                else if (cacheL==L3)
+                {
+                        unicache.caches->stats_t.readAc.access  = XML->sys.L3[ithCache].read_accesses;
+                        unicache.caches->stats_t.readAc.miss    = XML->sys.L3[ithCache].read_misses;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = XML->sys.L3[ithCache].write_accesses;
+                        unicache.caches->stats_t.writeAc.miss   = XML->sys.L3[ithCache].write_misses;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->rtp_stats = unicache.caches->stats_t;
+
+                        if (cachep.dir_ty==SBT)
+                        {
+                                homenode_rtp_stats.readAc.access  = XML->sys.L3[ithCache].homenode_read_accesses;
+                                homenode_rtp_stats.readAc.miss    = XML->sys.L3[ithCache].homenode_read_misses;
+                                homenode_rtp_stats.readAc.hit     = homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss;
+                                homenode_rtp_stats.writeAc.access = XML->sys.L3[ithCache].homenode_write_accesses;
+                                homenode_rtp_stats.writeAc.miss   = XML->sys.L3[ithCache].homenode_write_misses;
+                                homenode_rtp_stats.writeAc.hit    = homenode_rtp_stats.writeAc.access -        homenode_rtp_stats.writeAc.miss;
+                        }
+                }
+                else if (cacheL==L1Directory)
+                {
+                        unicache.caches->stats_t.readAc.access  = XML->sys.L1Directory[ithCache].read_accesses;
+                        unicache.caches->stats_t.readAc.miss    = XML->sys.L1Directory[ithCache].read_misses;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = XML->sys.L1Directory[ithCache].write_accesses;
+                        unicache.caches->stats_t.writeAc.miss   = XML->sys.L1Directory[ithCache].write_misses;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->rtp_stats = unicache.caches->stats_t;
+                }
+                else if (cacheL==L2Directory)
+                {
+                        unicache.caches->stats_t.readAc.access  = XML->sys.L2Directory[ithCache].read_accesses;
+                        unicache.caches->stats_t.readAc.miss    = XML->sys.L2Directory[ithCache].read_misses;
+                        unicache.caches->stats_t.readAc.hit     = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss;
+                        unicache.caches->stats_t.writeAc.access = XML->sys.L2Directory[ithCache].write_accesses;
+                        unicache.caches->stats_t.writeAc.miss   = XML->sys.L2Directory[ithCache].write_misses;
+                        unicache.caches->stats_t.writeAc.hit    = unicache.caches->stats_t.writeAc.access -    unicache.caches->stats_t.writeAc.miss;
+                        unicache.caches->rtp_stats = unicache.caches->stats_t;
+                }
+                if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+                {   //Assuming write back and write-allocate cache
+
+                        unicache.missb->stats_t.readAc.access  = unicache.caches->stats_t.writeAc.miss ;
+                        unicache.missb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss;
+                        unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+                        unicache.ifb->stats_t.readAc.access  = unicache.caches->stats_t.writeAc.miss;
+                        unicache.ifb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss;
+                        unicache.ifb->rtp_stats = unicache.ifb->stats_t;
+
+                        unicache.prefetchb->stats_t.readAc.access  = unicache.caches->stats_t.writeAc.miss;
+                        unicache.prefetchb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss;
+                        unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t;
+
+                        unicache.wbb->stats_t.readAc.access  = unicache.caches->stats_t.writeAc.miss;
+                        unicache.wbb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss;
+                        if (cachep.dir_ty==SBT)
+                        {
+                                unicache.missb->stats_t.readAc.access  += homenode_rtp_stats.writeAc.miss;
+                                unicache.missb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+                                unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+                                unicache.missb->stats_t.readAc.access  += homenode_rtp_stats.writeAc.miss;
+                                unicache.missb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+                                unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+                                unicache.ifb->stats_t.readAc.access  += homenode_rtp_stats.writeAc.miss;
+                                unicache.ifb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+                                unicache.ifb->rtp_stats = unicache.ifb->stats_t;
+
+                                unicache.prefetchb->stats_t.readAc.access  += homenode_rtp_stats.writeAc.miss;
+                                unicache.prefetchb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+                                unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t;
+
+                                unicache.wbb->stats_t.readAc.access  += homenode_rtp_stats.writeAc.miss;
+                                unicache.wbb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+                        }
+                        unicache.wbb->rtp_stats = unicache.wbb->stats_t;
+
+                }
+
+        }
+
+        unicache.power_t.reset();
+        if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+        {
+                unicache.power_t.readOp.dynamic        += (unicache.caches->stats_t.readAc.hit*unicache.caches->local_result.power.readOp.dynamic+
+                                unicache.caches->stats_t.readAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic+
+                                unicache.caches->stats_t.writeAc.miss*unicache.caches->local_result.tag_array2->power.writeOp.dynamic+
+                                unicache.caches->stats_t.writeAc.access*unicache.caches->local_result.power.writeOp.dynamic);//write miss will also generate a write later
+
+                if (cachep.dir_ty==SBT)
+                {
+                        unicache.power_t.readOp.dynamic        += homenode_stats_t.readAc.hit * (unicache.caches->local_result.data_array2->power.readOp.dynamic*dir_overhead +
+                                                unicache.caches->local_result.tag_array2->power.readOp.dynamic) +
+                                        homenode_stats_t.readAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+                                        homenode_stats_t.writeAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+                                homenode_stats_t.writeAc.hit*(unicache.caches->local_result.data_array2->power.writeOp.dynamic*dir_overhead +
+                                                        unicache.caches->local_result.tag_array2->power.readOp.dynamic+
+                                        homenode_stats_t.writeAc.miss*unicache.caches->local_result.power.writeOp.dynamic);//write miss on dynamic home node will generate a replacement write on whole cache block
+
+
+                }
+
+                unicache.power_t.readOp.dynamic        +=  unicache.missb->stats_t.readAc.access*unicache.missb->local_result.power.searchOp.dynamic +
+                unicache.missb->stats_t.writeAc.access*unicache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write
+                unicache.power_t.readOp.dynamic        +=  unicache.ifb->stats_t.readAc.access*unicache.ifb->local_result.power.searchOp.dynamic +
+                unicache.ifb->stats_t.writeAc.access*unicache.ifb->local_result.power.writeOp.dynamic;
+                unicache.power_t.readOp.dynamic        +=  unicache.prefetchb->stats_t.readAc.access*unicache.prefetchb->local_result.power.searchOp.dynamic +
+                unicache.prefetchb->stats_t.writeAc.access*unicache.prefetchb->local_result.power.writeOp.dynamic;
+                unicache.power_t.readOp.dynamic        +=  unicache.wbb->stats_t.readAc.access*unicache.wbb->local_result.power.searchOp.dynamic +
+                unicache.wbb->stats_t.writeAc.access*unicache.wbb->local_result.power.writeOp.dynamic;
+        }
+        else
+        {
+                unicache.power_t.readOp.dynamic        += (unicache.caches->stats_t.readAc.access*unicache.caches->local_result.power.searchOp.dynamic+
+                                unicache.caches->stats_t.writeAc.access*unicache.caches->local_result.power.writeOp.dynamic);
+        }
+
+        if (is_tdp)
+        {
+                unicache.power = unicache.power_t + (unicache.caches->local_result.power)*pppm_lkg;
+                if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+                {
+                        unicache.power = unicache.power+
+                        (unicache.missb->local_result.power +
+                                        unicache.ifb->local_result.power +
+                                        unicache.prefetchb->local_result.power +
+                                        unicache.wbb->local_result.power)*pppm_lkg;
+                }
+                power     = power + unicache.power;
+//             cout<<"unicache.caches->local_result.power.readOp.dynamic"<<unicache.caches->local_result.power.readOp.dynamic<<endl;
+//             cout<<"unicache.caches->local_result.power.writeOp.dynamic"<<unicache.caches->local_result.power.writeOp.dynamic<<endl;
+        }
+        else
+        {
+                unicache.rt_power = unicache.power_t + (unicache.caches->local_result.power)*pppm_lkg;
+                if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)))
+                {
+                        (unicache.rt_power = unicache.rt_power +
+                                        unicache.missb->local_result.power +
+                                        unicache.ifb->local_result.power +
+                                        unicache.prefetchb->local_result.power +
+                                        unicache.wbb->local_result.power)*pppm_lkg;
+                }
+                rt_power     = rt_power + unicache.rt_power;
+        }
+}
+
+void SharedCache::displayEnergy(uint32_t indent,bool is_tdp)
+{
+        string indent_str(indent, ' ');
+        string indent_str_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        if (is_tdp)
+        {
+                cout << (XML->sys.Private_L2? indent_str:"")<< cachep.name << endl;
+                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
+                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*cachep.clockRate << " W" << endl;
+                cout << indent_str << "Subthreshold Leakage = "
+                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
+                //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
+                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/cachep.executionTime << " W" << endl;
+                cout <<endl;
+        }
+        else
+        {
+        }
+}
+
+//void SharedCache::computeMaxPower()
+//{
+//  //Compute maximum power and runtime power.
+//  //When computing runtime power, McPAT gets or reasons out the statistics based on XML input.
+//  maxPower           = 0.0;
+//  //llCache,itlb
+//  llCache.maxPower   = 0.0;
+//  llCache.maxPower   +=  (llCache.caches.l_ip.num_rw_ports*(0.67*llCache.caches.local_result.power.readOp.dynamic+0.33*llCache.caches.local_result.power.writeOp.dynamic)
+//                        +llCache.caches.l_ip.num_rd_ports*llCache.caches.local_result.power.readOp.dynamic+llCache.caches.l_ip.num_wr_ports*llCache.caches.local_result.power.writeOp.dynamic
+//                        +llCache.caches.l_ip.num_se_rd_ports*llCache.caches.local_result.power.readOp.dynamic)*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower   +=  llCache.missb.l_ip.num_search_ports*llCache.missb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower   +=  llCache.ifb.l_ip.num_search_ports*llCache.ifb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower   +=  llCache.prefetchb.l_ip.num_search_ports*llCache.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower   +=  llCache.wbb.l_ip.num_search_ports*llCache.wbb.local_result.power.searchOp.dynamic*clockRate;
+//  //llCache.maxPower *=  scktRatio; //TODO: this calculation should be self-contained
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+////  directory_power =  (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+////                        +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+////                        +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//
+//  L2Tot.power.readOp.dynamic = llCache.maxPower;
+//  L2Tot.power.readOp.leakage = llCache.caches.local_result.power.readOp.leakage +
+//                               llCache.missb.local_result.power.readOp.leakage +
+//                               llCache.ifb.local_result.power.readOp.leakage +
+//                               llCache.prefetchb.local_result.power.readOp.leakage +
+//                               llCache.wbb.local_result.power.readOp.leakage;
+//
+//  L2Tot.area.set_area(llCache.area*1.1*1e-6);//placement and routing overhead
+//
+//  if (XML->sys.number_of_dir_levels==1)
+//  {
+//       if (XML->sys.first_level_dir==0)
+//       {
+//               directory.maxPower   = 0.0;
+//               directory.maxPower    +=  (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+//                                     +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+//                                     +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//               ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//               directory.maxPower    +=  directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate;
+//               ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//               directory.maxPower    +=  directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate;
+//               ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//               directory.maxPower    +=  directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//               ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//               directory.maxPower    +=  directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//               cc.power.readOp.dynamic = directory.maxPower*scktRatio*8;//8 is the memory controller counts
+//               cc.power.readOp.leakage = directory.caches.local_result.power.readOp.leakage +
+//                                     directory.missb.local_result.power.readOp.leakage +
+//                                     directory.ifb.local_result.power.readOp.leakage +
+//                                     directory.prefetchb.local_result.power.readOp.leakage +
+//                                     directory.wbb.local_result.power.readOp.leakage;
+//
+//               cc.power.readOp.leakage *=8;
+//
+//               cc.area.set_area(directory.area*8);
+//               cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//               cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//               ccTot.area.set_area(cc.area.get_area()*1e-6);
+//               ccTot.power = cc.power;
+//               cout<<"DC energy per access" << cc.power.readOp.dynamic/clockRate/8;
+//       }
+//       else if (XML->sys.first_level_dir==1)
+//       {
+//               inv_dir.maxPower = inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size;
+//               cc.power.readOp.dynamic  = inv_dir.maxPower*scktRatio*64/XML->sys.domain_size;
+//               cc.power.readOp.leakage  = inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*64/XML->sys.domain_size;
+//
+//               cc.area.set_area(inv_dir.area*64/XML->sys.domain_size);
+//               cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//               cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//               ccTot.area.set_area(cc.area.get_area()*1e-6);
+//               cout<<"DC energy per access" << cc.power.readOp.dynamic/clockRate/8;
+//               ccTot.power = cc.power;
+//       }
+//  }
+//
+//  else if (XML->sys.number_of_dir_levels==2)
+//  {
+//
+//                       directory.maxPower   = 0.0;
+//                       directory.maxPower    +=  (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+//                                             +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+//                                             +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//                       ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//                       directory.maxPower    +=  directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//                       directory.maxPower    +=  directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//                       directory.maxPower    +=  directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//                       directory.maxPower    +=  directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//                       cc.power.readOp.dynamic = directory.maxPower*scktRatio*8;//8 is the memory controller counts
+//                       cc.power.readOp.leakage = directory.caches.local_result.power.readOp.leakage +
+//                                          directory.missb.local_result.power.readOp.leakage +
+//                                          directory.ifb.local_result.power.readOp.leakage +
+//                                          directory.prefetchb.local_result.power.readOp.leakage +
+//                                          directory.wbb.local_result.power.readOp.leakage;
+//                       cc.power.readOp.leakage *=8;
+//                       cc.area.set_area(directory.area*8);
+//
+//                     if (XML->sys.first_level_dir==0)
+//                     {
+//                       directory1.maxPower   = 0.0;
+//                       directory1.maxPower   +=  (directory1.caches.l_ip.num_rw_ports*(0.67*directory1.caches.local_result.power.readOp.dynamic+0.33*directory1.caches.local_result.power.writeOp.dynamic)
+//                                       +directory1.caches.l_ip.num_rd_ports*directory1.caches.local_result.power.readOp.dynamic+directory1.caches.l_ip.num_wr_ports*directory1.caches.local_result.power.writeOp.dynamic
+//                                       +directory1.caches.l_ip.num_se_rd_ports*directory1.caches.local_result.power.readOp.dynamic)*clockRate;
+//                       ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl;
+//
+//                       directory1.maxPower   +=  directory1.missb.l_ip.num_search_ports*directory1.missb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl;
+//
+//                       directory1.maxPower   +=  directory1.ifb.l_ip.num_search_ports*directory1.ifb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl;
+//
+//                       directory1.maxPower   +=  directory1.prefetchb.l_ip.num_search_ports*directory1.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//                       ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl;
+//
+//                       directory1.maxPower   +=  directory1.wbb.l_ip.num_search_ports*directory1.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//                       cc1.power.readOp.dynamic = directory1.maxPower*scktRatio*64/XML->sys.domain_size;
+//                       cc1.power.readOp.leakage = directory1.caches.local_result.power.readOp.leakage +
+//                                          directory1.missb.local_result.power.readOp.leakage +
+//                                          directory1.ifb.local_result.power.readOp.leakage +
+//                                          directory1.prefetchb.local_result.power.readOp.leakage +
+//                                          directory1.wbb.local_result.power.readOp.leakage;
+//                       cc1.power.readOp.leakage *= 64/XML->sys.domain_size;
+//                       cc1.area.set_area(directory1.area*64/XML->sys.domain_size);
+//
+//                       cout<<"CC area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl;
+//                       cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl;
+//                       ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6);
+//                       ccTot.power = cc.power + cc1.power;
+//               }
+//               else if (XML->sys.first_level_dir==1)
+//               {
+//                       inv_dir.maxPower = inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size;
+//                       cc1.power.readOp.dynamic = inv_dir.maxPower*scktRatio*(64/XML->sys.domain_size);
+//                       cc1.power.readOp.leakage  = inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*XML->sys.domain_size;
+//
+//                       cc1.area.set_area(inv_dir.area*64/XML->sys.domain_size);
+//                       cout<<"CC area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl;
+//                       cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl;
+//                       ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6);
+//                       ccTot.power = cc.power + cc1.power;
+//
+//               }
+//               else if (XML->sys.first_level_dir==2)
+//               {
+//                       cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//                       cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//                       ccTot.area.set_area(cc.area.get_area()*1e-6);
+//                       ccTot.power = cc.power;
+//               }
+//  }
+//
+//cout<<"L2cache size="<<L2Tot.area.get_area()*1e-6<<endl;
+//cout<<"L2cache dynamic power="<<L2Tot.power.readOp.dynamic<<endl;
+//cout<<"L2cache laeakge power="<<L2Tot.power.readOp.leakage<<endl;
+//
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//
+//  maxPower          +=  llCache.maxPower;
+//  ///cout<<"maxpower=" <<maxPower<<endl;
+//
+////  maxPower   +=  pipeLogicCache.power.readOp.dynamic*clockRate;
+////  ///cout<<"pipeLogic.power="<<pipeLogicCache.power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+////
+////  maxPower   +=  pipeLogicDirectory.power.readOp.dynamic*clockRate;
+////  ///cout<<"pipeLogic.power="<<pipeLogicDirectory.power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+////
+////  //clock power
+////  maxPower += clockNetwork.total_power.readOp.dynamic*clockRate;
+////  ///cout<<"clockNetwork.total_power="<<clockNetwork.total_power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+//
+//}
+
+void SharedCache::set_cache_param()
+{
+        if (cacheL==L2)
+        {
+                cachep.name = "L2";
+                cachep.clockRate       = XML->sys.L2[ithCache].clockrate;
+                cachep.clockRate       *= 1e6;
+                cachep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+                interface_ip.data_arr_ram_cell_tech_type    = XML->sys.L2[ithCache].device_type;//long channel device LSTP
+                interface_ip.data_arr_peri_global_tech_type = XML->sys.L2[ithCache].device_type;
+                interface_ip.tag_arr_ram_cell_tech_type     = XML->sys.L2[ithCache].device_type;
+                interface_ip.tag_arr_peri_global_tech_type  = XML->sys.L2[ithCache].device_type;
+                cachep.capacity      = XML->sys.L2[ithCache].L2_config[0];
+                cachep.blockW        = XML->sys.L2[ithCache].L2_config[1];
+                cachep.assoc         = XML->sys.L2[ithCache].L2_config[2];
+                cachep.nbanks        = XML->sys.L2[ithCache].L2_config[3];
+                cachep.throughput    = XML->sys.L2[ithCache].L2_config[4]/cachep.clockRate;
+                cachep.latency       = XML->sys.L2[ithCache].L2_config[5]/cachep.clockRate;
+                cachep.missb_size    = XML->sys.L2[ithCache].buffer_sizes[0];
+                cachep.fu_size       = XML->sys.L2[ithCache].buffer_sizes[1];
+                cachep.prefetchb_size= XML->sys.L2[ithCache].buffer_sizes[2];
+                cachep.wbb_size      = XML->sys.L2[ithCache].buffer_sizes[3];
+                cachep.duty_cycle    = XML->sys.L2[ithCache].duty_cycle;
+                if (!XML->sys.L2[ithCache].merged_dir)
+                {
+                        cachep.dir_ty = NonDir;
+                }
+                else
+                {
+                        cachep.dir_ty = SBT;
+                        cachep.dir_duty_cycle  = XML->sys.L2[ithCache].dir_duty_cycle;
+                }
+        }
+        else if (cacheL==L3)
+        {
+                cachep.name = "L3";
+                cachep.clockRate       = XML->sys.L3[ithCache].clockrate;
+                cachep.clockRate       *= 1e6;
+                cachep.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+                interface_ip.data_arr_ram_cell_tech_type    = XML->sys.L3[ithCache].device_type;//long channel device LSTP
+                interface_ip.data_arr_peri_global_tech_type = XML->sys.L3[ithCache].device_type;
+                interface_ip.tag_arr_ram_cell_tech_type     = XML->sys.L3[ithCache].device_type;
+                interface_ip.tag_arr_peri_global_tech_type  = XML->sys.L3[ithCache].device_type;
+                cachep.capacity      = XML->sys.L3[ithCache].L3_config[0];
+                cachep.blockW        = XML->sys.L3[ithCache].L3_config[1];
+                cachep.assoc         = XML->sys.L3[ithCache].L3_config[2];
+                cachep.nbanks        = XML->sys.L3[ithCache].L3_config[3];
+                cachep.throughput    = XML->sys.L3[ithCache].L3_config[4]/cachep.clockRate;
+                cachep.latency       = XML->sys.L3[ithCache].L3_config[5]/cachep.clockRate;
+                cachep.missb_size    = XML->sys.L3[ithCache].buffer_sizes[0];
+                cachep.fu_size       = XML->sys.L3[ithCache].buffer_sizes[1];
+                cachep.prefetchb_size= XML->sys.L3[ithCache].buffer_sizes[2];
+                cachep.wbb_size      = XML->sys.L3[ithCache].buffer_sizes[3];
+                cachep.duty_cycle    = XML->sys.L3[ithCache].duty_cycle;
+                if (!XML->sys.L2[ithCache].merged_dir)
+                {
+                        cachep.dir_ty = NonDir;
+                }
+                else
+                {
+                        cachep.dir_ty = SBT;
+                        cachep.dir_duty_cycle  = XML->sys.L2[ithCache].dir_duty_cycle;
+                }
+        }
+        else if (cacheL==L1Directory)
+                {
+                        cachep.name = "First Level Directory";
+                        cachep.dir_ty = (enum Dir_type) XML->sys.L1Directory[ithCache].Directory_type;
+                        cachep.clockRate       = XML->sys.L1Directory[ithCache].clockrate;
+                        cachep.clockRate       *= 1e6;
+                        cachep.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+                        interface_ip.data_arr_ram_cell_tech_type    = XML->sys.L1Directory[ithCache].device_type;//long channel device LSTP
+                        interface_ip.data_arr_peri_global_tech_type = XML->sys.L1Directory[ithCache].device_type;
+                        interface_ip.tag_arr_ram_cell_tech_type     = XML->sys.L1Directory[ithCache].device_type;
+                        interface_ip.tag_arr_peri_global_tech_type  = XML->sys.L1Directory[ithCache].device_type;
+                        cachep.capacity      = XML->sys.L1Directory[ithCache].Dir_config[0];
+                        cachep.blockW        = XML->sys.L1Directory[ithCache].Dir_config[1];
+                        cachep.assoc         = XML->sys.L1Directory[ithCache].Dir_config[2];
+                        cachep.nbanks        = XML->sys.L1Directory[ithCache].Dir_config[3];
+                        cachep.throughput    = XML->sys.L1Directory[ithCache].Dir_config[4]/cachep.clockRate;
+                        cachep.latency       = XML->sys.L1Directory[ithCache].Dir_config[5]/cachep.clockRate;
+                        cachep.missb_size    = XML->sys.L1Directory[ithCache].buffer_sizes[0];
+                        cachep.fu_size       = XML->sys.L1Directory[ithCache].buffer_sizes[1];
+                        cachep.prefetchb_size= XML->sys.L1Directory[ithCache].buffer_sizes[2];
+                        cachep.wbb_size      = XML->sys.L1Directory[ithCache].buffer_sizes[3];
+                        cachep.duty_cycle    = XML->sys.L1Directory[ithCache].duty_cycle;
+                }
+        else if (cacheL==L2Directory)
+                {
+                        cachep.name = "Second Level Directory";
+                        cachep.dir_ty = (enum Dir_type) XML->sys.L2Directory[ithCache].Directory_type;
+                        cachep.clockRate       = XML->sys.L2Directory[ithCache].clockrate;
+                        cachep.clockRate       *= 1e6;
+                        cachep.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+                        interface_ip.data_arr_ram_cell_tech_type    = XML->sys.L2Directory[ithCache].device_type;//long channel device LSTP
+                        interface_ip.data_arr_peri_global_tech_type = XML->sys.L2Directory[ithCache].device_type;
+                        interface_ip.tag_arr_ram_cell_tech_type     = XML->sys.L2Directory[ithCache].device_type;
+                        interface_ip.tag_arr_peri_global_tech_type  = XML->sys.L2Directory[ithCache].device_type;
+                        cachep.capacity      = XML->sys.L2Directory[ithCache].Dir_config[0];
+                        cachep.blockW        = XML->sys.L2Directory[ithCache].Dir_config[1];
+                        cachep.assoc         = XML->sys.L2Directory[ithCache].Dir_config[2];
+                        cachep.nbanks        = XML->sys.L2Directory[ithCache].Dir_config[3];
+                        cachep.throughput    = XML->sys.L2Directory[ithCache].Dir_config[4]/cachep.clockRate;
+                        cachep.latency       = XML->sys.L2Directory[ithCache].Dir_config[5]/cachep.clockRate;
+                        cachep.missb_size    = XML->sys.L2Directory[ithCache].buffer_sizes[0];
+                        cachep.fu_size       = XML->sys.L2Directory[ithCache].buffer_sizes[1];
+                        cachep.prefetchb_size= XML->sys.L2Directory[ithCache].buffer_sizes[2];
+                        cachep.wbb_size      = XML->sys.L2Directory[ithCache].buffer_sizes[3];
+                        cachep.duty_cycle    = XML->sys.L2Directory[ithCache].duty_cycle;
+                }
+        //cachep.cache_duty_cycle=cachep.dir_duty_cycle = 0.35;
+}
+
diff --git a/ext/mcpat/sharedcache.h b/ext/mcpat/sharedcache.h
new file mode 100644 (file)
index 0000000..9234084
--- /dev/null
@@ -0,0 +1,89 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef SHAREDCACHE_H_
+#define SHAREDCACHE_H_
+#include <vector>
+
+#include "XML_Parse.h"
+#include "area.h"
+#include "array.h"
+#include "basic_components.h"
+#include "logic.h"
+#include "parameter.h"
+
+class SharedCache :public Component{ // Component describing one shared L2/L3 cache or L1/L2 directory; which one is chosen by cacheL
+  public:
+    ParseXML * XML; // parsed XML configuration; set_cache_param() reads XML->sys through it
+    int ithCache; // index into XML->sys.L2 / L3 / L1Directory / L2Directory used by set_cache_param()
+        InputParameter interface_ip; // held by value; set_cache_param() writes its data/tag *_tech_type fields
+        enum cache_level cacheL; // selects which XML->sys array set_cache_param() reads
+    DataCache unicache;//Shared cache
+    CacheDynParam cachep; // name, clock rate, geometry, buffer sizes and duty cycles filled in by set_cache_param()
+    statsDef   homenode_tdp_stats; // NOTE(review): presumably home-node statistics for the TDP case; confirm in sharedcache.cc
+    statsDef   homenode_rtp_stats; // NOTE(review): presumably home-node statistics for the runtime case; confirm in sharedcache.cc
+    statsDef   homenode_stats_t; // NOTE(review): presumably scratch/total home-node statistics; confirm in sharedcache.cc
+    double        dir_overhead; // NOTE(review): not written by set_cache_param(); meaning must be confirmed in sharedcache.cc
+    // cache_processor llCache,directory, directory1, inv_dir;
+
+    //pipeline pipeLogicCache, pipeLogicDirectory;
+    //clock_network                            clockNetwork;
+    double scktRatio, executionTime; // NOTE(review): set_cache_param() writes cachep.executionTime, not this member; confirm usage
+    //   Component L2Tot, cc, cc1, ccTot;
+
+    SharedCache(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_,enum cache_level cacheL_ =L2); // cacheL_ defaults to L2
+    void set_cache_param(); // copies the XML settings for (cacheL, ithCache) into cachep and interface_ip
+        void computeEnergy(bool is_tdp=true); // is_tdp defaults to true; NOTE(review): presumably TDP vs. runtime statistics, confirm
+    void displayEnergy(uint32_t indent = 0,bool is_tdp=true); // indent defaults to 0, is_tdp to true
+    ~SharedCache(){}; // empty inline destructor
+};
+
+class CCdir :public Component{ // Component declaring a DataCache, an ArrayST pointer and aggregate sub-Components; NOTE(review): presumably the cache-coherence directory model, confirm in sharedcache.cc
+  public:
+    ParseXML * XML; // parsed XML configuration handle
+    int ithCache; // NOTE(review): presumably the index of this instance in the XML arrays, as in SharedCache; confirm
+        InputParameter interface_ip; // held by value, not by pointer
+    DataCache dc;//Shared cache
+    ArrayST * shadow_dir; // raw pointer; NOTE(review): ownership is not visible here, presumably released by ~CCdir(); confirm
+//     cache_processor llCache,directory, directory1, inv_dir;
+
+    //pipeline pipeLogicCache, pipeLogicDirectory;
+    //clock_network                            clockNetwork;
+    double scktRatio, clockRate, executionTime; // NOTE(review): units and initialisation are not visible in this header
+    Component L2Tot, cc, cc1, ccTot; // sub-Components held by value; their roles are defined in the implementation file
+
+    CCdir(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_); // same leading parameters as SharedCache's constructor, without a cache level
+    void computeEnergy(bool is_tdp=true); // is_tdp defaults to true
+    void displayEnergy(uint32_t indent = 0,bool is_tdp=true); // indent defaults to 0, is_tdp to true
+    ~CCdir(); // declared only; the definition lives outside this header
+};
+
+#endif /* SHAREDCACHE_H_ */
diff --git a/ext/mcpat/technology_xeon_core.cc b/ext/mcpat/technology_xeon_core.cc
new file mode 100644 (file)
index 0000000..4e60edc
--- /dev/null
@@ -0,0 +1,2772 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#include "basic_circuit.h"
+
+#include "parameter.h"
+
+/*
+ * Wire resistance from resistivity and cross-section geometry.
+ *
+ * The effective cross-section is
+ *   (wire_thickness - barrier_thickness - dishing_thickness)
+ *     * (wire_width - 2 * barrier_thickness)
+ * and the result is alpha_scatter * resistivity divided by that area.
+ * No check is made for a zero or negative cross-section.
+ * NOTE(review): presumably a per-unit-length value; confirm against callers.
+ */
+double wire_resistance(double resistivity, double wire_width, double wire_thickness,
+    double barrier_thickness, double dishing_thickness, double alpha_scatter)
+{
+  const double effective_height = wire_thickness - barrier_thickness - dishing_thickness;
+  const double effective_width  = wire_width - 2 * barrier_thickness;
+
+  return alpha_scatter * resistivity / (effective_height * effective_width);
+}
+
+double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing,
+    double ild_thickness, double miller_value, double horiz_dielectric_constant,
+    double vert_dielectric_constant, double fringe_cap)
+{
+  double vertical_cap, sidewall_cap, total_cap;
+  vertical_cap = 2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness;
+  sidewall_cap = 2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing;
+  total_cap = vertical_cap + sidewall_cap + fringe_cap;
+  return(total_cap);
+}
+
+
+void init_tech_params(double technology, bool is_tag)
+{
+  int    iter, tech, tech_lo, tech_hi;
+  double curr_alpha, curr_vpp;
+  double wire_width, wire_thickness, wire_spacing,
+         fringe_cap, pmos_to_nmos_sizing_r;
+//  double aspect_ratio,ild_thickness, miller_value = 1.5, horiz_dielectric_constant, vert_dielectric_constant;
+  double barrier_thickness, dishing_thickness, alpha_scatter;
+  double curr_vdd_dram_cell, curr_v_th_dram_access_transistor, curr_I_on_dram_cell, curr_c_dram_cell;
+
+  uint32_t ram_cell_tech_type    = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
+  uint32_t peri_global_tech_type = (is_tag) ? g_ip->tag_arr_peri_global_tech_type : g_ip->data_arr_peri_global_tech_type;
+
+  technology  = technology * 1000.0;  // in the unit of nm
+
+  // initialize parameters
+  g_tp.reset();
+  double gmp_to_gmn_multiplier_periph_global = 0;
+
+  double curr_Wmemcella_dram, curr_Wmemcellpmos_dram, curr_Wmemcellnmos_dram,
+         curr_area_cell_dram, curr_asp_ratio_cell_dram, curr_Wmemcella_sram,
+         curr_Wmemcellpmos_sram, curr_Wmemcellnmos_sram, curr_area_cell_sram,
+         curr_asp_ratio_cell_sram, curr_I_off_dram_cell_worst_case_length_temp;
+  double curr_Wmemcella_cam, curr_Wmemcellpmos_cam, curr_Wmemcellnmos_cam, curr_area_cell_cam,//Sheng: CAM data
+         curr_asp_ratio_cell_cam;
+  double SENSE_AMP_D, SENSE_AMP_P; // J
+  double area_cell_dram = 0;
+  double asp_ratio_cell_dram = 0;
+  double area_cell_sram = 0;
+  double asp_ratio_cell_sram = 0;
+  double area_cell_cam = 0;
+  double asp_ratio_cell_cam = 0;
+  double mobility_eff_periph_global = 0;
+  double Vdsat_periph_global = 0;
+  double nmos_effective_resistance_multiplier;
+  double width_dram_access_transistor;
+
+  double curr_logic_scaling_co_eff = 0;//This is based on the reported numbers of Intel Merom 65nm, Penryn45nm and IBM cell 90/65/45 date
+  double curr_core_tx_density = 0;//this is density per um^2; 90, ...22nm based on Intel Penryn
+  double curr_chip_layout_overhead = 0;
+  double curr_macro_layout_overhead = 0;
+  double curr_sckt_co_eff = 0;
+
+  if (technology < 91 && technology > 89)
+  {
+    tech_lo = 90;
+    tech_hi = 90;
+  }
+  else if (technology < 66 && technology > 64)
+  {
+    tech_lo = 65;
+    tech_hi = 65;
+  }
+  else if (technology < 46 && technology > 44)
+  {
+    tech_lo = 45;
+    tech_hi = 45;
+  }
+  else if (technology < 33 && technology > 31)
+  {
+    tech_lo = 32;
+    tech_hi = 32;
+  }
+  else if (technology < 23 && technology > 21)
+  {
+    tech_lo = 22;
+    tech_hi = 22;
+    if (ram_cell_tech_type == 3)
+    {
+       cout<<"current version does not support eDRAM technologies at 22nm"<<endl;
+       exit(0);
+    }
+  }
+//  else if (technology < 17 && technology > 15)
+//  {
+//    tech_lo = 16;
+//    tech_hi = 16;
+//  }
+  else if (technology < 90 && technology > 65)
+  {
+    tech_lo = 90;
+    tech_hi = 65;
+  }
+  else if (technology < 65 && technology > 45)
+  {
+    tech_lo = 65;
+    tech_hi = 45;
+  }
+  else if (technology < 45 && technology > 32)
+  {
+    tech_lo = 45;
+    tech_hi = 32;
+  }
+  else if (technology < 32 && technology > 22)
+    {
+      tech_lo = 32;
+      tech_hi = 22;
+    }
+//  else if (technology < 22 && technology > 16)
+//    {
+//      tech_lo = 22;
+//      tech_hi = 16;
+//    }
+      else
+    {
+          cout<<"Invalid technology nodes"<<endl;
+          exit(0);
+    }
+
+  double vdd[NUMBER_TECH_FLAVORS];
+  double Lphy[NUMBER_TECH_FLAVORS];
+  double Lelec[NUMBER_TECH_FLAVORS];
+  double t_ox[NUMBER_TECH_FLAVORS];
+  double v_th[NUMBER_TECH_FLAVORS];
+  double c_ox[NUMBER_TECH_FLAVORS];
+  double mobility_eff[NUMBER_TECH_FLAVORS];
+  double Vdsat[NUMBER_TECH_FLAVORS];
+  double c_g_ideal[NUMBER_TECH_FLAVORS];
+  double c_fringe[NUMBER_TECH_FLAVORS];
+  double c_junc[NUMBER_TECH_FLAVORS];
+  double I_on_n[NUMBER_TECH_FLAVORS];
+  double I_on_p[NUMBER_TECH_FLAVORS];
+  double Rnchannelon[NUMBER_TECH_FLAVORS];
+  double Rpchannelon[NUMBER_TECH_FLAVORS];
+  double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS];
+  double I_off_n[NUMBER_TECH_FLAVORS][101];
+  double I_g_on_n[NUMBER_TECH_FLAVORS][101];
+  //double I_off_p[NUMBER_TECH_FLAVORS][101];
+  double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS];
+  //double curr_sckt_co_eff[NUMBER_TECH_FLAVORS];
+  double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS];
+
+  for (iter = 0; iter <= 1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 90)
+    {
+      SENSE_AMP_D = .28e-9; // s
+      SENSE_AMP_P = 14.7e-15; // J
+      //90nm technology-node. Corresponds to year 2004 in ITRS
+      //ITRS HP device type
+      vdd[0]   = 1.2;
+      Lphy[0]  = 0.037;//Lphy is the physical gate-length. micron
+      Lelec[0] = 0.0266;//Lelec is the electrical gate-length. micron
+      t_ox[0]  = 1.2e-3;//micron
+      v_th[0]  = 0.23707;//V
+      c_ox[0]  = 1.79e-14;//F/micron2
+      mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+      Vdsat[0] = 0.128; //V
+      c_g_ideal[0] = 6.64e-16;//F/micron
+      c_fringe[0]  = 0.08e-15;//F/micron
+      c_junc[0] = 1e-15;//F/micron2
+      I_on_n[0] = 1076.9e-6;//A/micron
+      I_on_p[0] = 712.6e-6;//A/micron
+      //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0]  = 3.24e-8;//A/micron
+      I_off_n[0][10] = 4.01e-8;
+      I_off_n[0][20] = 4.90e-8;
+      I_off_n[0][30] = 5.92e-8;
+      I_off_n[0][40] = 7.08e-8;
+      I_off_n[0][50] = 8.38e-8;
+      I_off_n[0][60] = 9.82e-8;
+      I_off_n[0][70] = 1.14e-7;
+      I_off_n[0][80] = 1.29e-7;
+      I_off_n[0][90] = 1.43e-7;
+      I_off_n[0][100] = 1.54e-7;
+
+      I_g_on_n[0][0]  = 1.65e-8;//A/micron
+      I_g_on_n[0][10] = 1.65e-8;
+      I_g_on_n[0][20] = 1.65e-8;
+      I_g_on_n[0][30] = 1.65e-8;
+      I_g_on_n[0][40] = 1.65e-8;
+      I_g_on_n[0][50] = 1.65e-8;
+      I_g_on_n[0][60] = 1.65e-8;
+      I_g_on_n[0][70] = 1.65e-8;
+      I_g_on_n[0][80] = 1.65e-8;
+      I_g_on_n[0][90] = 1.65e-8;
+      I_g_on_n[0][100] = 1.65e-8;
+
+      //ITRS LSTP device type
+      vdd[1]   = 1.3;
+      Lphy[1]  = 0.075;
+      Lelec[1] = 0.0486;
+      t_ox[1]  = 2.2e-3;
+      v_th[1]  = 0.48203;
+      c_ox[1]  = 1.22e-14;
+      mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.373;
+      c_g_ideal[1] = 9.15e-16;
+      c_fringe[1]  = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 503.6e-6;
+      I_on_p[1] = 235.1e-6;
+      nmos_effective_resistance_multiplier = 1.92;
+      n_to_p_eff_curr_drv_ratio[1] = 2.44;
+      gmp_to_gmn_multiplier[1] =0.88;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1;
+      I_off_n[1][0]  = 2.81e-12;
+      I_off_n[1][10] = 4.76e-12;
+      I_off_n[1][20] = 7.82e-12;
+      I_off_n[1][30] = 1.25e-11;
+      I_off_n[1][40] = 1.94e-11;
+      I_off_n[1][50] = 2.94e-11;
+      I_off_n[1][60] = 4.36e-11;
+      I_off_n[1][70] = 6.32e-11;
+      I_off_n[1][80] = 8.95e-11;
+      I_off_n[1][90] = 1.25e-10;
+      I_off_n[1][100] = 1.7e-10;
+
+      I_g_on_n[1][0]  = 3.87e-11;//A/micron
+      I_g_on_n[1][10] = 3.87e-11;
+      I_g_on_n[1][20] = 3.87e-11;
+      I_g_on_n[1][30] = 3.87e-11;
+      I_g_on_n[1][40] = 3.87e-11;
+      I_g_on_n[1][50] = 3.87e-11;
+      I_g_on_n[1][60] = 3.87e-11;
+      I_g_on_n[1][70] = 3.87e-11;
+      I_g_on_n[1][80] = 3.87e-11;
+      I_g_on_n[1][90] = 3.87e-11;
+      I_g_on_n[1][100] = 3.87e-11;
+
+      //ITRS LOP device type
+      vdd[2] = 0.9;
+      Lphy[2] = 0.053;
+      Lelec[2] = 0.0354;
+      t_ox[2] = 1.5e-3;
+      v_th[2] = 0.30764;
+      c_ox[2] = 1.59e-14;
+      mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.113;
+      c_g_ideal[2] = 8.45e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 386.6e-6;
+      I_on_p[2] = 209.7e-6;
+      nmos_effective_resistance_multiplier = 1.77;
+      n_to_p_eff_curr_drv_ratio[2] = 2.54;
+      gmp_to_gmn_multiplier[2] = 0.98;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1;
+      I_off_n[2][0] = 2.14e-9;
+      I_off_n[2][10] = 2.9e-9;
+      I_off_n[2][20] = 3.87e-9;
+      I_off_n[2][30] = 5.07e-9;
+      I_off_n[2][40] = 6.54e-9;
+      I_off_n[2][50] = 8.27e-8;
+      I_off_n[2][60] = 1.02e-7;
+      I_off_n[2][70] = 1.20e-7;
+      I_off_n[2][80] = 1.36e-8;
+      I_off_n[2][90] = 1.52e-8;
+      I_off_n[2][100] = 1.73e-8;
+
+      I_g_on_n[2][0]  = 4.31e-8;//A/micron
+      I_g_on_n[2][10] = 4.31e-8;
+      I_g_on_n[2][20] = 4.31e-8;
+      I_g_on_n[2][30] = 4.31e-8;
+      I_g_on_n[2][40] = 4.31e-8;
+      I_g_on_n[2][50] = 4.31e-8;
+      I_g_on_n[2][60] = 4.31e-8;
+      I_g_on_n[2][70] = 4.31e-8;
+      I_g_on_n[2][80] = 4.31e-8;
+      I_g_on_n[2][90] = 4.31e-8;
+      I_g_on_n[2][100] = 4.31e-8;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.4545;
+        width_dram_access_transistor = 0.14;
+        curr_I_on_dram_cell = 45e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.168;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.4545;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  323.95 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.3;
+        c_g_ideal[3] = 1.47e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 321.6e-6;
+        I_on_p[3] = 203.3e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.42e-11;
+        I_off_n[3][10] = 2.25e-11;
+        I_off_n[3][20] = 3.46e-11;
+        I_off_n[3][30] = 5.18e-11;
+        I_off_n[3][40] = 7.58e-11;
+        I_off_n[3][50] = 1.08e-10;
+        I_off_n[3][60] = 1.51e-10;
+        I_off_n[3][70] = 2.02e-10;
+        I_off_n[3][80] = 2.57e-10;
+        I_off_n[3][90] = 3.14e-10;
+        I_off_n[3][100] = 3.85e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.6;
+        Lphy[3] = 0.09;
+        Lelec[3] = 0.0576;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.09*0.09;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.7;
+        t_ox[3] = 5.5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 5.65e-15;
+        mobility_eff[3] =  302.2 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.32;
+        c_g_ideal[3] = 5.08e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1094.3e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.62;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 5.80e-15;
+        I_off_n[3][10] = 1.21e-14;
+        I_off_n[3][20] = 2.42e-14;
+        I_off_n[3][30] = 4.65e-14;
+        I_off_n[3][40] = 8.60e-14;
+        I_off_n[3][50] = 1.54e-13;
+        I_off_n[3][60] = 2.66e-13;
+        I_off_n[3][70] = 4.45e-13;
+        I_off_n[3][80] = 7.17e-13;
+        I_off_n[3][90] = 1.11e-12;
+        I_off_n[3][100] = 1.67e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360
+      curr_asp_ratio_cell_cam = 2.92;//2.5
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff  = 1;
+      curr_core_tx_density       = 1.25*0.7*0.7;
+      curr_sckt_co_eff           = 1.1539;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+
+
+    }
+
+    if (tech == 65)
+    { //65nm technology-node. Corresponds to year 2007 in ITRS
+      //ITRS HP device type
+//      SENSE_AMP_D = .2e-9; // s
+//      SENSE_AMP_P = 5.7e-15; // J
+//      vdd[0] = 1.1;
+//      Lphy[0] = 0.025;
+//      Lelec[0] = 0.019;
+//      t_ox[0] = 1.1e-3;
+//      v_th[0] = .19491;
+//      c_ox[0] = 1.88e-14;
+//      mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6);
+//      Vdsat[0] = 7.71e-2;
+//      c_g_ideal[0] = 4.69e-16;
+//      c_fringe[0] = 0.077e-15;
+//      c_junc[0] = 1e-15;
+//      I_on_n[0] = 1197.2e-6;
+//      I_on_p[0] = 870.8e-6;
+//      nmos_effective_resistance_multiplier = 1.50;
+//      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+//      gmp_to_gmn_multiplier[0] = 1.38;
+//      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+//      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+//      long_channel_leakage_reduction[0] = 1/3.74;
+//      //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first
+//      //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74.
+//      I_off_n[0][0] = 1.96e-7;
+//      I_off_n[0][10] = 2.29e-7;
+//      I_off_n[0][20] = 2.66e-7;
+//      I_off_n[0][30] = 3.05e-7;
+//      I_off_n[0][40] = 3.49e-7;
+//      I_off_n[0][50] = 3.95e-7;
+//      I_off_n[0][60] = 4.45e-7;
+//      I_off_n[0][70] = 4.97e-7;
+//      I_off_n[0][80] = 5.48e-7;
+//      I_off_n[0][90] = 5.94e-7;
+//      I_off_n[0][100] = 6.3e-7;
+//      I_g_on_n[0][0]  = 4.09e-8;//A/micron
+//      I_g_on_n[0][10] = 4.09e-8;
+//      I_g_on_n[0][20] = 4.09e-8;
+//      I_g_on_n[0][30] = 4.09e-8;
+//      I_g_on_n[0][40] = 4.09e-8;
+//      I_g_on_n[0][50] = 4.09e-8;
+//      I_g_on_n[0][60] = 4.09e-8;
+//      I_g_on_n[0][70] = 4.09e-8;
+//      I_g_on_n[0][80] = 4.09e-8;
+//      I_g_on_n[0][90] = 4.09e-8;
+//      I_g_on_n[0][100] = 4.09e-8;
+
+        SENSE_AMP_D = .2e-9; // s
+        SENSE_AMP_P = 5.7e-15; // J
+        vdd[0] = 1.25;
+        Lphy[0] = 0.025;
+        Lelec[0] = 0.019;
+        t_ox[0] = 1.1e-3;
+        v_th[0] = .12491;
+        c_ox[0] = 1.88e-14;
+        mobility_eff[0] = 409.31 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[0] = 9.08e-2;
+        c_g_ideal[0] = 4.72e-16;
+        c_fringe[0] = 0.08e-15;
+        c_junc[0] = 1e-15;
+        I_on_n[0] = 1486.4e-6;
+        I_on_p[0] = 1131.5e-6;
+        nmos_effective_resistance_multiplier = 1.57;
+        n_to_p_eff_curr_drv_ratio[0] = 2;
+        gmp_to_gmn_multiplier[0] = 1.38;
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+        long_channel_leakage_reduction[0] = 1.0/4.97;
+        //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first
+        //Ioff(Lgate normal)/Ioff(Lgate long)= 4.97@Vdd=1.25 (3.74@Vdd=1.1); however, an Intel paper suggests the reduction factor is 3.
+        I_off_n[0][0]  = 8.62e-7;
+        I_off_n[0][10] = 9.08e-7;
+        I_off_n[0][20] = 9.55e-7;
+        I_off_n[0][30] = 1.00e-6;
+        I_off_n[0][40] = 1.05e-6;
+        I_off_n[0][50] = 1.09e-6;
+        I_off_n[0][60] = 1.14e-6;
+        I_off_n[0][70] = 1.18e-6;
+        I_off_n[0][80] = 1.23e-6;
+        I_off_n[0][90] = 1.27e-6;
+        I_off_n[0][100] = 1.31e-6;
+
+
+        I_g_on_n[0][0]  = 7.02e-8;//A/micron
+        I_g_on_n[0][10] = 7.02e-8;
+        I_g_on_n[0][20] = 7.02e-8;
+        I_g_on_n[0][30] = 7.02e-8;
+        I_g_on_n[0][40] = 7.02e-8;
+        I_g_on_n[0][50] = 7.02e-8;
+        I_g_on_n[0][60] = 7.02e-8;
+        I_g_on_n[0][70] = 7.02e-8;
+        I_g_on_n[0][80] = 7.02e-8;
+        I_g_on_n[0][90] = 7.02e-8;
+        I_g_on_n[0][100] = 7.02e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.2;
+      Lphy[1] = 0.045;
+      Lelec[1] = 0.0298;
+      t_ox[1] = 1.9e-3;
+      v_th[1] = 0.52354;
+      c_ox[1] = 1.36e-14;
+      mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.128;
+      c_g_ideal[1] = 6.14e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 519.2e-6;
+      I_on_p[1] = 266e-6;
+      nmos_effective_resistance_multiplier = 1.96;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.82;
+      I_off_n[1][0] = 9.12e-12;
+      I_off_n[1][10] = 1.49e-11;
+      I_off_n[1][20] = 2.36e-11;
+      I_off_n[1][30] = 3.64e-11;
+      I_off_n[1][40] = 5.48e-11;
+      I_off_n[1][50] = 8.05e-11;
+      I_off_n[1][60] = 1.15e-10;
+      I_off_n[1][70] = 1.59e-10;
+      I_off_n[1][80] = 2.1e-10;
+      I_off_n[1][90] = 2.62e-10;
+      I_off_n[1][100] = 3.21e-10;
+
+      I_g_on_n[1][0]  = 1.09e-10;//A/micron
+      I_g_on_n[1][10] = 1.09e-10;
+      I_g_on_n[1][20] = 1.09e-10;
+      I_g_on_n[1][30] = 1.09e-10;
+      I_g_on_n[1][40] = 1.09e-10;
+      I_g_on_n[1][50] = 1.09e-10;
+      I_g_on_n[1][60] = 1.09e-10;
+      I_g_on_n[1][70] = 1.09e-10;
+      I_g_on_n[1][80] = 1.09e-10;
+      I_g_on_n[1][90] = 1.09e-10;
+      I_g_on_n[1][100] = 1.09e-10;
+
+      //ITRS LOP device type
+      vdd[2] = 0.8;
+      Lphy[2] = 0.032;
+      Lelec[2] = 0.0216;
+      t_ox[2] = 1.2e-3;
+      v_th[2] = 0.28512;
+      c_ox[2] = 1.87e-14;
+      mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.292;
+      c_g_ideal[2] = 6e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 573.1e-6;
+      I_on_p[2] = 340.6e-6;
+      nmos_effective_resistance_multiplier = 1.82;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/2.05;
+      I_off_n[2][0] = 4.9e-9;
+      I_off_n[2][10] = 6.49e-9;
+      I_off_n[2][20] = 8.45e-9;
+      I_off_n[2][30] = 1.08e-8;
+      I_off_n[2][40] = 1.37e-8;
+      I_off_n[2][50] = 1.71e-8;
+      I_off_n[2][60] = 2.09e-8;
+      I_off_n[2][70] = 2.48e-8;
+      I_off_n[2][80] = 2.84e-8;
+      I_off_n[2][90] = 3.13e-8;
+      I_off_n[2][100] = 3.42e-8;
+
+      I_g_on_n[2][0]  = 9.61e-9;//A/micron
+      I_g_on_n[2][10] = 9.61e-9;
+      I_g_on_n[2][20] = 9.61e-9;
+      I_g_on_n[2][30] = 9.61e-9;
+      I_g_on_n[2][40] = 9.61e-9;
+      I_g_on_n[2][50] = 9.61e-9;
+      I_g_on_n[2][60] = 9.61e-9;
+      I_g_on_n[2][70] = 9.61e-9;
+      I_g_on_n[2][80] = 9.61e-9;
+      I_g_on_n[2][90] = 9.61e-9;
+      I_g_on_n[2][100] = 9.61e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.43806;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.11;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.43806;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  328.32 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.43806;
+        c_g_ideal[3] = 1.46e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 399.8e-6;
+        I_on_p[3] = 243.4e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 2.23e-11;
+        I_off_n[3][10] = 3.46e-11;
+        I_off_n[3][20] = 5.24e-11;
+        I_off_n[3][30] = 7.75e-11;
+        I_off_n[3][40] = 1.12e-10;
+        I_off_n[3][50] = 1.58e-10;
+        I_off_n[3][60] = 2.18e-10;
+        I_off_n[3][70] = 2.88e-10;
+        I_off_n[3][80] = 3.63e-10;
+        I_off_n[3][90] = 4.41e-10;
+        I_off_n[3][100] = 5.36e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.3;
+        Lphy[3] = 0.065;
+        Lelec[3] = 0.0426;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.065;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.065*0.065;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.3;
+        t_ox[3] = 5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 6.16e-15;
+        mobility_eff[3] =  303.44 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.385;
+        c_g_ideal[3] = 4e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 1031e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 2.39;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 1.80e-14;
+        I_off_n[3][10] = 3.64e-14;
+        I_off_n[3][20] = 7.03e-14;
+        I_off_n[3][30] = 1.31e-13;
+        I_off_n[3][40] = 2.35e-13;
+        I_off_n[3][50] = 4.09e-13;
+        I_off_n[3][60] = 6.89e-13;
+        I_off_n[3][70] = 1.13e-12;
+        I_off_n[3][80] = 1.78e-12;
+        I_off_n[3][90] = 2.71e-12;
+        I_off_n[3][100] = 3.99e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7;
+      curr_core_tx_density      = 1.25*0.7;
+      curr_sckt_co_eff           = 1.1359;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 45)
+    { //45nm technology-node. Corresponds to year 2010 in ITRS
+      //ITRS HP device type
+      SENSE_AMP_D = .04e-9; // s
+      SENSE_AMP_P = 2.7e-15; // J
+      vdd[0] = 1.0;
+      Lphy[0] = 0.018;
+      Lelec[0] = 0.01345;
+      t_ox[0] = 0.65e-3;
+      v_th[0] = .18035;
+      c_ox[0] = 3.77e-14;
+      mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 9.38E-2;
+      c_g_ideal[0] = 6.78e-16;
+      c_fringe[0] = 0.05e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 2046.6e-6;
+      //There are certain problems with the ITRS PMOS numbers in MASTAR for 45nm. So we are using 65nm values of
+      //n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier for 45nm
+      I_on_p[0] = I_on_n[0] / 2;//This value is fixed arbitrarily but I_on_p is not being used in CACTI
+      nmos_effective_resistance_multiplier = 1.51;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1/3.546;//Using MASTAR, @380K, increase Lgate until Ion reduces to 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74
+      I_off_n[0][0] = 2.8e-7;
+      I_off_n[0][10] = 3.28e-7;
+      I_off_n[0][20] = 3.81e-7;
+      I_off_n[0][30] = 4.39e-7;
+      I_off_n[0][40] = 5.02e-7;
+      I_off_n[0][50] = 5.69e-7;
+      I_off_n[0][60] = 6.42e-7;
+      I_off_n[0][70] = 7.2e-7;
+      I_off_n[0][80] = 8.03e-7;
+      I_off_n[0][90] = 8.91e-7;
+      I_off_n[0][100] = 9.84e-7;
+
+      I_g_on_n[0][0]  = 3.59e-8;//A/micron
+      I_g_on_n[0][10] = 3.59e-8;
+      I_g_on_n[0][20] = 3.59e-8;
+      I_g_on_n[0][30] = 3.59e-8;
+      I_g_on_n[0][40] = 3.59e-8;
+      I_g_on_n[0][50] = 3.59e-8;
+      I_g_on_n[0][60] = 3.59e-8;
+      I_g_on_n[0][70] = 3.59e-8;
+      I_g_on_n[0][80] = 3.59e-8;
+      I_g_on_n[0][90] = 3.59e-8;
+      I_g_on_n[0][100] = 3.59e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.1;
+      Lphy[1] =  0.028;
+      Lelec[1] = 0.0212;
+      t_ox[1] = 1.4e-3;
+      v_th[1] = 0.50245;
+      c_ox[1] = 2.01e-14;
+      mobility_eff[1] =  363.96 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 9.12e-2;
+      c_g_ideal[1] = 5.18e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 666.2e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.08;
+      I_off_n[1][0] = 1.01e-11;
+      I_off_n[1][10] = 1.65e-11;
+      I_off_n[1][20] = 2.62e-11;
+      I_off_n[1][30] = 4.06e-11;
+      I_off_n[1][40] = 6.12e-11;
+      I_off_n[1][50] = 9.02e-11;
+      I_off_n[1][60] = 1.3e-10;
+      I_off_n[1][70] = 1.83e-10;
+      I_off_n[1][80] = 2.51e-10;
+      I_off_n[1][90] = 3.29e-10;
+      I_off_n[1][100] = 4.1e-10;
+
+      I_g_on_n[1][0]  = 9.47e-12;//A/micron
+      I_g_on_n[1][10] = 9.47e-12;
+      I_g_on_n[1][20] = 9.47e-12;
+      I_g_on_n[1][30] = 9.47e-12;
+      I_g_on_n[1][40] = 9.47e-12;
+      I_g_on_n[1][50] = 9.47e-12;
+      I_g_on_n[1][60] = 9.47e-12;
+      I_g_on_n[1][70] = 9.47e-12;
+      I_g_on_n[1][80] = 9.47e-12;
+      I_g_on_n[1][90] = 9.47e-12;
+      I_g_on_n[1][100] = 9.47e-12;
+
+      //ITRS LOP device type
+      vdd[2] = 0.7;
+      Lphy[2] = 0.022;
+      Lelec[2] = 0.016;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.22599;
+      c_ox[2] = 2.82e-14;//F/micron2
+      mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 5.71e-2;
+      c_g_ideal[2] = 6.2e-16;
+      c_fringe[2] = 0.073e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 748.9e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.76;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.92;
+      I_off_n[2][0] = 4.03e-9;
+      I_off_n[2][10] = 5.02e-9;
+      I_off_n[2][20] = 6.18e-9;
+      I_off_n[2][30] = 7.51e-9;
+      I_off_n[2][40] = 9.04e-9;
+      I_off_n[2][50] = 1.08e-8;
+      I_off_n[2][60] = 1.27e-8;
+      I_off_n[2][70] = 1.47e-8;
+      I_off_n[2][80] = 1.66e-8;
+      I_off_n[2][90] = 1.84e-8;
+      I_off_n[2][100] = 2.03e-8;
+
+      I_g_on_n[2][0]  = 3.24e-8;//A/micron
+      I_g_on_n[2][10] = 4.01e-8;
+      I_g_on_n[2][20] = 4.90e-8;
+      I_g_on_n[2][30] = 5.92e-8;
+      I_g_on_n[2][40] = 7.08e-8;
+      I_g_on_n[2][50] = 8.38e-8;
+      I_g_on_n[2][60] = 9.82e-8;
+      I_g_on_n[2][70] = 1.14e-7;
+      I_g_on_n[2][80] = 1.29e-7;
+      I_g_on_n[2][90] = 1.43e-7;
+      I_g_on_n[2][100] = 1.54e-7;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.078;
+        Lelec[3] = 0.0504;// Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44559;
+        width_dram_access_transistor = 0.079;
+        curr_I_on_dram_cell = 36e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2.1e-3;
+        v_th[3] = 0.44559;
+        c_ox[3] = 1.41e-14;
+        mobility_eff[3] =   426.30 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.181;
+        c_g_ideal[3] = 1.10e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 456e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 2.54e-11;
+        I_off_n[3][10] = 3.94e-11;
+        I_off_n[3][20] = 5.95e-11;
+        I_off_n[3][30] = 8.79e-11;
+        I_off_n[3][40] = 1.27e-10;
+        I_off_n[3][50] = 1.79e-10;
+        I_off_n[3][60] = 2.47e-10;
+        I_off_n[3][70] = 3.31e-10;
+        I_off_n[3][80] = 4.26e-10;
+        I_off_n[3][90] = 5.27e-10;
+        I_off_n[3][100] = 6.46e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.045;
+        Lelec[3] = 0.0298;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.045;
+        curr_I_on_dram_cell = 20e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = 6*0.045*0.045;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.7;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.98e-15;
+        mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.147;
+        c_g_ideal[3] = 3.59e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 999.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.31e-14;
+        I_off_n[3][10] = 2.68e-14;
+        I_off_n[3][20] = 5.25e-14;
+        I_off_n[3][30] = 9.88e-14;
+        I_off_n[3][40] = 1.79e-13;
+        I_off_n[3][50] = 3.15e-13;
+        I_off_n[3][60] = 5.36e-13;
+        I_off_n[3][70] = 8.86e-13;
+        I_off_n[3][80] = 1.42e-12;
+        I_off_n[3][90] = 2.20e-12;
+        I_off_n[3][100] = 3.29e-12;
+      }
+
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7;
+      curr_core_tx_density      = 1.25;
+      curr_sckt_co_eff           = 1.1387;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 32)
+    {
+      SENSE_AMP_D = .03e-9; // s
+      SENSE_AMP_P = 2.16e-15; // J
+      //For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is 32 nm
+      //technology i.e. FEATURESIZE = 0.032). Using the SOI process numbers for
+      //HP and LSTP.
+      vdd[0] = 0.9;
+      Lphy[0] = 0.013;
+      Lelec[0] = 0.01013;
+      t_ox[0] = 0.5e-3;
+      v_th[0] = 0.21835;
+      c_ox[0] = 4.11e-14;
+      mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 5.09E-2;
+      c_g_ideal[0] = 5.34e-16;
+      c_fringe[0] = 0.04e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] =  2211.7e-6;
+      I_on_p[0] = I_on_n[0] / 2;
+      nmos_effective_resistance_multiplier = 1.49;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1/3.706;
+      //Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate until Ion reduces to 95% or Lgate increase by 5% (DG device can only increase by 5%),
+      //whichever comes first
+      I_off_n[0][0] = 1.52e-7;
+      I_off_n[0][10] = 1.55e-7;
+      I_off_n[0][20] = 1.59e-7;
+      I_off_n[0][30] = 1.68e-7;
+      I_off_n[0][40] = 1.90e-7;
+      I_off_n[0][50] = 2.69e-7;
+      I_off_n[0][60] = 5.32e-7;
+      I_off_n[0][70] = 1.02e-6;
+      I_off_n[0][80] = 1.62e-6;
+      I_off_n[0][90] = 2.73e-6;
+      I_off_n[0][100] = 6.1e-6;
+
+      I_g_on_n[0][0]  = 6.55e-8;//A/micron
+      I_g_on_n[0][10] = 6.55e-8;
+      I_g_on_n[0][20] = 6.55e-8;
+      I_g_on_n[0][30] = 6.55e-8;
+      I_g_on_n[0][40] = 6.55e-8;
+      I_g_on_n[0][50] = 6.55e-8;
+      I_g_on_n[0][60] = 6.55e-8;
+      I_g_on_n[0][70] = 6.55e-8;
+      I_g_on_n[0][80] = 6.55e-8;
+      I_g_on_n[0][90] = 6.55e-8;
+      I_g_on_n[0][100] = 6.55e-8;
+
+//      32 DG
+//      I_g_on_n[0][0]  = 2.71e-9;//A/micron
+//      I_g_on_n[0][10] = 2.71e-9;
+//      I_g_on_n[0][20] = 2.71e-9;
+//      I_g_on_n[0][30] = 2.71e-9;
+//      I_g_on_n[0][40] = 2.71e-9;
+//      I_g_on_n[0][50] = 2.71e-9;
+//      I_g_on_n[0][60] = 2.71e-9;
+//      I_g_on_n[0][70] = 2.71e-9;
+//      I_g_on_n[0][80] = 2.71e-9;
+//      I_g_on_n[0][90] = 2.71e-9;
+//      I_g_on_n[0][100] = 2.71e-9;
+
+      //LSTP device type
+      vdd[1] = 1;
+      Lphy[1] = 0.020;
+      Lelec[1] = 0.0173;
+      t_ox[1] = 1.2e-3;
+      v_th[1] = 0.513;
+      c_ox[1] = 2.29e-14;
+      mobility_eff[1] =  347.46 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 8.64e-2;
+      c_g_ideal[1] = 4.58e-16;
+      c_fringe[1] = 0.053e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 683.6e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/1.93;
+      I_off_n[1][0] = 2.06e-11;
+      I_off_n[1][10] = 3.30e-11;
+      I_off_n[1][20] = 5.15e-11;
+      I_off_n[1][30] = 7.83e-11;
+      I_off_n[1][40] = 1.16e-10;
+      I_off_n[1][50] = 1.69e-10;
+      I_off_n[1][60] = 2.40e-10;
+      I_off_n[1][70] = 3.34e-10;
+      I_off_n[1][80] = 4.54e-10;
+      I_off_n[1][90] = 5.96e-10;
+      I_off_n[1][100] = 7.44e-10;
+
+      I_g_on_n[1][0]  = 3.73e-11;//A/micron
+      I_g_on_n[1][10] = 3.73e-11;
+      I_g_on_n[1][20] = 3.73e-11;
+      I_g_on_n[1][30] = 3.73e-11;
+      I_g_on_n[1][40] = 3.73e-11;
+      I_g_on_n[1][50] = 3.73e-11;
+      I_g_on_n[1][60] = 3.73e-11;
+      I_g_on_n[1][70] = 3.73e-11;
+      I_g_on_n[1][80] = 3.73e-11;
+      I_g_on_n[1][90] = 3.73e-11;
+      I_g_on_n[1][100] = 3.73e-11;
+
+
+      //LOP device type
+      vdd[2] = 0.6;
+      Lphy[2] = 0.016;
+      Lelec[2] = 0.01232;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.24227;
+      c_ox[2] = 2.84e-14;
+      mobility_eff[2] =  513.52 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 4.64e-2;
+      c_g_ideal[2] = 4.54e-16;
+      c_fringe[2] = 0.057e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 827.8e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.73;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.89;
+      I_off_n[2][0] = 5.94e-8;
+      I_off_n[2][10] = 7.23e-8;
+      I_off_n[2][20] = 8.7e-8;
+      I_off_n[2][30] = 1.04e-7;
+      I_off_n[2][40] = 1.22e-7;
+      I_off_n[2][50] = 1.43e-7;
+      I_off_n[2][60] = 1.65e-7;
+      I_off_n[2][70] = 1.90e-7;
+      I_off_n[2][80] = 2.15e-7;
+      I_off_n[2][90] = 2.39e-7;
+      I_off_n[2][100] = 2.63e-7;
+
+      I_g_on_n[2][0]  = 2.93e-9;//A/micron
+      I_g_on_n[2][10] = 2.93e-9;
+      I_g_on_n[2][20] = 2.93e-9;
+      I_g_on_n[2][30] = 2.93e-9;
+      I_g_on_n[2][40] = 2.93e-9;
+      I_g_on_n[2][50] = 2.93e-9;
+      I_g_on_n[2][60] = 2.93e-9;
+      I_g_on_n[2][70] = 2.93e-9;
+      I_g_on_n[2][80] = 2.93e-9;
+      I_g_on_n[2][90] = 2.93e-9;
+      I_g_on_n[2][100] = 2.93e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.056;
+        Lelec[3] = 0.0419;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44129;
+        width_dram_access_transistor = 0.056;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2e-3;
+        v_th[3] = 0.44467;
+        c_ox[3] = 1.48e-14;
+        mobility_eff[3] =  408.12 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.174;
+        c_g_ideal[3] = 7.45e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1055.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.57e-11;
+        I_off_n[3][10] = 5.51e-11;
+        I_off_n[3][20] = 8.27e-11;
+        I_off_n[3][30] = 1.21e-10;
+        I_off_n[3][40] = 1.74e-10;
+        I_off_n[3][50] = 2.45e-10;
+        I_off_n[3][60] = 3.38e-10;
+        I_off_n[3][70] = 4.53e-10;
+        I_off_n[3][80] = 5.87e-10;
+        I_off_n[3][90] = 7.29e-10;
+        I_off_n[3][100] = 8.87e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.032;
+        Lelec[3] = 0.0205;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.032;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.032*0.032;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.6;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.99e-15;
+        mobility_eff[3] =  380.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.129;
+        c_g_ideal[3] = 2.56e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1024.5e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.63e-14;
+        I_off_n[3][10] = 7.18e-14;
+        I_off_n[3][20] = 1.36e-13;
+        I_off_n[3][30] = 2.49e-13;
+        I_off_n[3][40] = 4.41e-13;
+        I_off_n[3][50] = 7.55e-13;
+        I_off_n[3][60] = 1.26e-12;
+        I_off_n[3][70] = 2.03e-12;
+        I_off_n[3][80] = 3.19e-12;
+        I_off_n[3][90] = 4.87e-12;
+        I_off_n[3][100] = 7.16e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7*0.7;
+      curr_core_tx_density      = 1.25/0.7;
+      curr_sckt_co_eff           = 1.1111;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if(tech == 22){
+        //For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is 22 nm
+        //technology i.e. FEATURESIZE = 0.022). Using the DG process numbers for HP.
+        //22 nm HP
+        vdd[0] = 0.8;
+        Lphy[0] = 0.009;//Lphy is the physical gate-length.
+        Lelec[0] = 0.00468;//Lelec is the electrical gate-length.
+        t_ox[0] = 0.55e-3;//micron
+        v_th[0] = 0.1395;//V
+        c_ox[0] = 3.63e-14;//F/micron2
+        mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[0] = 2.33e-2; //V/micron
+        c_g_ideal[0] = 3.27e-16;//F/micron
+        c_fringe[0] = 0.06e-15;//F/micron
+        c_junc[0] = 0;//F/micron2
+        I_on_n[0] =  2626.4e-6;//A/micron
+        I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier = 1.45;
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+        //"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/3.274;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2;//From 22nm, leakage current are directly from ITRS report rather than MASTAR, since MASTAR has serious bugs there.
+        I_off_n[0][10] = 1.55e-7/1.5*1.2;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2;
+        //for 22nm DG HP
+        I_g_on_n[0][0]  = 1.81e-9;//A/micron
+        I_g_on_n[0][10] = 1.81e-9;
+        I_g_on_n[0][20] = 1.81e-9;
+        I_g_on_n[0][30] = 1.81e-9;
+        I_g_on_n[0][40] = 1.81e-9;
+        I_g_on_n[0][50] = 1.81e-9;
+        I_g_on_n[0][60] = 1.81e-9;
+        I_g_on_n[0][70] = 1.81e-9;
+        I_g_on_n[0][80] = 1.81e-9;
+        I_g_on_n[0][90] = 1.81e-9;
+        I_g_on_n[0][100] = 1.81e-9;
+
+        //22 nm LSTP DG
+        vdd[1] = 0.8;
+        Lphy[1] = 0.014;
+        Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+        t_ox[1] = 1.1e-3;//micron
+        v_th[1] = 0.40126;//V
+        c_ox[1] = 2.30e-14;//F/micron2
+        mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[1] = 6.64e-2; //V/micron
+        c_g_ideal[1] = 3.22e-16;//F/micron
+        c_fringe[1] = 0.08e-15;
+        c_junc[1] = 0;//F/micron2
+        I_on_n[1] = 727.6e-6;//A/micron
+        I_on_p[1] = I_on_n[1] / 2;
+        nmos_effective_resistance_multiplier = 1.99;
+        n_to_p_eff_curr_drv_ratio[1] = 2;
+        gmp_to_gmn_multiplier[1] = 0.99;
+        Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+        Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+        long_channel_leakage_reduction[1] = 1/1.89;
+        I_off_n[1][0] = 2.43e-11;
+        I_off_n[1][10] = 4.85e-11;
+        I_off_n[1][20] = 9.68e-11;
+        I_off_n[1][30] = 1.94e-10;
+        I_off_n[1][40] = 3.87e-10;
+        I_off_n[1][50] = 7.73e-10;
+        I_off_n[1][60] = 3.55e-10;
+        I_off_n[1][70] = 3.09e-9;
+        I_off_n[1][80] = 6.19e-9;
+        I_off_n[1][90] = 1.24e-8;
+        I_off_n[1][100]= 2.48e-8;
+
+        I_g_on_n[1][0]  = 4.51e-10;//A/micron
+        I_g_on_n[1][10] = 4.51e-10;
+        I_g_on_n[1][20] = 4.51e-10;
+        I_g_on_n[1][30] = 4.51e-10;
+        I_g_on_n[1][40] = 4.51e-10;
+        I_g_on_n[1][50] = 4.51e-10;
+        I_g_on_n[1][60] = 4.51e-10;
+        I_g_on_n[1][70] = 4.51e-10;
+        I_g_on_n[1][80] = 4.51e-10;
+        I_g_on_n[1][90] = 4.51e-10;
+        I_g_on_n[1][100] = 4.51e-10;
+
+        //22 nm LOP
+        vdd[2] = 0.6;
+        Lphy[2] = 0.011;
+        Lelec[2] = 0.00604;//Lelec is the electrical gate-length.
+        t_ox[2] = 0.8e-3;//micron
+        v_th[2] = 0.2315;//V
+        c_ox[2] = 2.87e-14;//F/micron2
+        mobility_eff[2] =  698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[2] = 1.81e-2; //V/micron
+        c_g_ideal[2] = 3.16e-16;//F/micron
+        c_fringe[2] = 0.08e-15;
+        c_junc[2] = 0;//F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab
+        I_on_n[2] = 916.1e-6;//A/micron
+        I_on_p[2] = I_on_n[2] / 2;
+        nmos_effective_resistance_multiplier = 1.73;
+        n_to_p_eff_curr_drv_ratio[2] = 2;
+        gmp_to_gmn_multiplier[2] = 1.11;
+        Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];//ohm-micron
+        Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];//ohm-micron
+        long_channel_leakage_reduction[2] = 1/2.38;
+
+        I_off_n[2][0] = 1.31e-8;
+        I_off_n[2][10] = 2.60e-8;
+        I_off_n[2][20] = 5.14e-8;
+        I_off_n[2][30] = 1.02e-7;
+        I_off_n[2][40] = 2.02e-7;
+        I_off_n[2][50] = 3.99e-7;
+        I_off_n[2][60] = 7.91e-7;
+        I_off_n[2][70] = 1.09e-6;
+        I_off_n[2][80] = 2.09e-6;
+        I_off_n[2][90] = 4.04e-6;
+        I_off_n[2][100]= 4.48e-6;
+
+        I_g_on_n[2][0]  = 2.74e-9;//A/micron
+        I_g_on_n[2][10] = 2.74e-9;
+        I_g_on_n[2][20] = 2.74e-9;
+        I_g_on_n[2][30] = 2.74e-9;
+        I_g_on_n[2][40] = 2.74e-9;
+        I_g_on_n[2][50] = 2.74e-9;
+        I_g_on_n[2][60] = 2.74e-9;
+        I_g_on_n[2][70] = 2.74e-9;
+        I_g_on_n[2][80] = 2.74e-9;
+        I_g_on_n[2][90] = 2.74e-9;
+        I_g_on_n[2][100] = 2.74e-9;
+
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+        //22 nm commodity DRAM cell access transistor technology parameters.
+                //parameters
+                curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+                //2005 ITRS, the value was about twice the value in 2007 ITRS
+                Lphy[3] = 0.022;//micron
+                Lelec[3] = 0.0181;//micron.
+                curr_v_th_dram_access_transistor = 1;//V
+                width_dram_access_transistor = 0.022;//micron
+                curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+                //kept constant. In reality this could perhaps be lower
+                curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+                curr_Wmemcella_dram = width_dram_access_transistor;
+                curr_Wmemcellpmos_dram = 0;
+                curr_Wmemcellnmos_dram = 0;
+                curr_area_cell_dram = 6*0.022*0.022;//micron2.
+                curr_asp_ratio_cell_dram = 0.667;
+                curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+                //kept constant.
+
+        //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR.
+                curr_vpp = 2.3;//vpp. V
+                t_ox[3] = 3.5e-3;//micron
+                v_th[3] = 1.0;//V
+                c_ox[3] = 9.06e-15;//F/micron2
+                mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+                Vdsat[3] = 0.0972; //V/micron
+                c_g_ideal[3] = 1.99e-16;//F/micron
+                c_fringe[3] = 0.053e-15;//F/micron
+                c_junc[3] = 1e-15;//F/micron2
+                I_on_n[3] = 910.5e-6;//A/micron
+                I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used.
+                nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+                //
+                n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+                gmp_to_gmn_multiplier[3] = 0.90;
+                Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+                Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+                long_channel_leakage_reduction[3] = 1;
+                I_off_n[3][0] = 1.1e-13; //A/micron
+                I_off_n[3][10] = 2.11e-13;
+                I_off_n[3][20] = 3.88e-13;
+                I_off_n[3][30] = 6.9e-13;
+                I_off_n[3][40] = 1.19e-12;
+                I_off_n[3][50] = 1.98e-12;
+                I_off_n[3][60] = 3.22e-12;
+                I_off_n[3][70] = 5.09e-12;
+                I_off_n[3][80] = 7.85e-12;
+                I_off_n[3][90] = 1.18e-11;
+                I_off_n[3][100] = 1.72e-11;
+
+        }
+        else
+        {
+          //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+        }
+
+    if(tech == 16){
+        //For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is 16 nm
+        //technology i.e. FEATURESIZE = 0.016). Using the DG process numbers for HP.
+        //16 nm HP
+        vdd[0] = 0.7;
+        Lphy[0] = 0.006;//Lphy is the physical gate-length.
+        Lelec[0] = 0.00315;//Lelec is the electrical gate-length.
+        t_ox[0] = 0.5e-3;//micron
+        v_th[0] = 0.1489;//V
+        c_ox[0] = 3.83e-14;//F/micron2 Cox_elec in MASTAR
+        mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+        Vdsat[0] = 1.42e-2; //V/micron calculated in spreadsheet
+        c_g_ideal[0] = 2.30e-16;//F/micron
+        c_fringe[0] = 0.06e-15;//F/micron MASTAR inputdynamic/3
+        c_junc[0] = 0;//F/micron2 MASTAR result dynamic
+        I_on_n[0] =  2768.4e-6;//A/micron
+        I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier = 1.48;//nmos_effective_resistance_multiplier  is the ratio of Ieff to Idsat where Ieff is the effective NMOS current and Idsat is the saturation current.
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+        //"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/2.655;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2*1.07;
+        I_off_n[0][10] = 1.55e-7/1.5*1.2*1.07;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2*1.07;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2*1.07;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2*1.07;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2*1.07;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2*1.07;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2*1.07;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2*1.07;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2*1.07;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2*1.07;
+        //for 16nm DG HP
+        I_g_on_n[0][0]  = 1.07e-9;//A/micron
+        I_g_on_n[0][10] = 1.07e-9;
+        I_g_on_n[0][20] = 1.07e-9;
+        I_g_on_n[0][30] = 1.07e-9;
+        I_g_on_n[0][40] = 1.07e-9;
+        I_g_on_n[0][50] = 1.07e-9;
+        I_g_on_n[0][60] = 1.07e-9;
+        I_g_on_n[0][70] = 1.07e-9;
+        I_g_on_n[0][80] = 1.07e-9;
+        I_g_on_n[0][90] = 1.07e-9;
+        I_g_on_n[0][100] = 1.07e-9;
+
+//     //16 nm LSTP DG
+//     vdd[1] = 0.8;
+//     Lphy[1] = 0.014;
+//     Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+//     t_ox[1] = 1.1e-3;//micron
+//     v_th[1] = 0.40126;//V
+//     c_ox[1] = 2.30e-14;//F/micron2
+//     mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+//     Vdsat[1] = 6.64e-2; //V/micron
+//     c_g_ideal[1] = 3.22e-16;//F/micron
+//     c_fringe[1] = 0.008e-15;
+//     c_junc[1] = 0;//F/micron2
+//     I_on_n[1] = 727.6e-6;//A/micron
+//     I_on_p[1] = I_on_n[1] / 2;
+//     nmos_effective_resistance_multiplier = 1.99;
+//     n_to_p_eff_curr_drv_ratio[1] = 2;
+//     gmp_to_gmn_multiplier[1] = 0.99;
+//     Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+//     Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+//     I_off_n[1][0] = 2.43e-11;
+//     I_off_n[1][10] = 4.85e-11;
+//     I_off_n[1][20] = 9.68e-11;
+//     I_off_n[1][30] = 1.94e-10;
+//     I_off_n[1][40] = 3.87e-10;
+//     I_off_n[1][50] = 7.73e-10;
+//     I_off_n[1][60] = 3.55e-10;
+//     I_off_n[1][70] = 3.09e-9;
+//     I_off_n[1][80] = 6.19e-9;
+//     I_off_n[1][90] = 1.24e-8;
+//     I_off_n[1][100]= 2.48e-8;
+//
+//     //    for 22nm LSTP HP
+//     I_g_on_n[1][0]  = 4.51e-10;//A/micron
+//     I_g_on_n[1][10] = 4.51e-10;
+//     I_g_on_n[1][20] = 4.51e-10;
+//     I_g_on_n[1][30] = 4.51e-10;
+//     I_g_on_n[1][40] = 4.51e-10;
+//     I_g_on_n[1][50] = 4.51e-10;
+//     I_g_on_n[1][60] = 4.51e-10;
+//     I_g_on_n[1][70] = 4.51e-10;
+//     I_g_on_n[1][80] = 4.51e-10;
+//     I_g_on_n[1][90] = 4.51e-10;
+//     I_g_on_n[1][100] = 4.51e-10;
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+        //16 nm node: reuses the 22 nm commodity DRAM cell access transistor technology parameters.
+                //parameters
+                curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+                //2005 ITRS, the value was about twice the value in 2007 ITRS
+                Lphy[3] = 0.022;//micron
+                Lelec[3] = 0.0181;//micron.
+                curr_v_th_dram_access_transistor = 1;//V
+                width_dram_access_transistor = 0.022;//micron
+                curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+                //kept constant. In reality this could perhaps be lower
+                curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+                curr_Wmemcella_dram = width_dram_access_transistor;
+                curr_Wmemcellpmos_dram = 0;
+                curr_Wmemcellnmos_dram = 0;
+                curr_area_cell_dram = 6*0.022*0.022;//micron2.
+                curr_asp_ratio_cell_dram = 0.667;
+                curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+                //kept constant.
+
+        //16 nm node: reuses the 22 nm commodity DRAM wordline transistor parameters obtained using MASTAR.
+                curr_vpp = 2.3;//vpp. V
+                t_ox[3] = 3.5e-3;//micron
+                v_th[3] = 1.0;//V
+                c_ox[3] = 9.06e-15;//F/micron2
+                mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+                Vdsat[3] = 0.0972; //V/micron
+                c_g_ideal[3] = 1.99e-16;//F/micron
+                c_fringe[3] = 0.053e-15;//F/micron
+                c_junc[3] = 1e-15;//F/micron2
+                I_on_n[3] = 910.5e-6;//A/micron
+                I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used.
+                nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+                //
+                n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+                gmp_to_gmn_multiplier[3] = 0.90;
+                Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+                Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+                long_channel_leakage_reduction[3] = 1;
+                I_off_n[3][0] = 1.1e-13; //A/micron
+                I_off_n[3][10] = 2.11e-13;
+                I_off_n[3][20] = 3.88e-13;
+                I_off_n[3][30] = 6.9e-13;
+                I_off_n[3][40] = 1.19e-12;
+                I_off_n[3][50] = 1.98e-12;
+                I_off_n[3][60] = 3.22e-12;
+                I_off_n[3][70] = 5.09e-12;
+                I_off_n[3][80] = 7.85e-12;
+                I_off_n[3][90] = 1.18e-11;
+                I_off_n[3][100] = 1.72e-11;
+
+        }
+        else
+        {
+          //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+        }
+
+
+    g_tp.peri_global.Vdd       += curr_alpha * vdd[peri_global_tech_type];
+    g_tp.peri_global.t_ox      += curr_alpha * t_ox[peri_global_tech_type];
+    g_tp.peri_global.Vth       += curr_alpha * v_th[peri_global_tech_type];
+    g_tp.peri_global.C_ox      += curr_alpha * c_ox[peri_global_tech_type];
+    g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type];
+    g_tp.peri_global.C_fringe  += curr_alpha * c_fringe[peri_global_tech_type];
+    g_tp.peri_global.C_junc    += curr_alpha * c_junc[peri_global_tech_type];
+    g_tp.peri_global.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.peri_global.l_phy     += curr_alpha * Lphy[peri_global_tech_type];
+    g_tp.peri_global.l_elec    += curr_alpha * Lelec[peri_global_tech_type];
+    g_tp.peri_global.I_on_n    += curr_alpha * I_on_n[peri_global_tech_type];
+    g_tp.peri_global.R_nch_on  += curr_alpha * Rnchannelon[peri_global_tech_type];
+    g_tp.peri_global.R_pch_on  += curr_alpha * Rpchannelon[peri_global_tech_type];
+    g_tp.peri_global.n_to_p_eff_curr_drv_ratio
+      += curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type];
+    g_tp.peri_global.long_channel_leakage_reduction
+      += curr_alpha * long_channel_leakage_reduction[peri_global_tech_type];
+    g_tp.peri_global.I_off_n   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_off_p   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_n   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_p   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    gmp_to_gmn_multiplier_periph_global += curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type];
+
+    g_tp.sram_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.sram_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.sram_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.sram_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.sram_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.sram_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.sram_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.sram_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.sram_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.sram_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram_cell_Vdd      += curr_alpha * curr_vdd_dram_cell;
+    g_tp.dram_acc.Vth       += curr_alpha * curr_v_th_dram_access_transistor;
+    g_tp.dram_acc.l_phy     += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_acc.l_elec    += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_fringe  += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc    += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_cell_I_on     += curr_alpha * curr_I_on_dram_cell;
+    g_tp.dram_cell_I_off_worst_case_len_temp += curr_alpha * curr_I_off_dram_cell_worst_case_length_temp;
+    g_tp.dram_acc.I_on_n    += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_cell_C        += curr_alpha * curr_c_dram_cell;
+    g_tp.vpp                += curr_alpha * curr_vpp;
+    g_tp.dram_wl.l_phy      += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_wl.l_elec     += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_g_ideal  += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_fringe   += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc     += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_wl.I_on_n     += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_nch_on   += curr_alpha * Rnchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_pch_on   += curr_alpha * Rpchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor];
+    g_tp.dram_wl.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor];
+    g_tp.dram_wl.I_off_n    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+    g_tp.dram_wl.I_off_p    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+
+    g_tp.cam_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.cam_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.cam_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.cam_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.cam_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.cam_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.cam_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.cam_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.cam_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.cam_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram.cell_a_w    += curr_alpha * curr_Wmemcella_dram;
+    g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram;
+    g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram;
+    area_cell_dram        += curr_alpha * curr_area_cell_dram;
+    asp_ratio_cell_dram   += curr_alpha * curr_asp_ratio_cell_dram;
+
+    g_tp.sram.cell_a_w    += curr_alpha * curr_Wmemcella_sram;
+    g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram;
+    g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram;
+    area_cell_sram += curr_alpha * curr_area_cell_sram;
+    asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram;
+
+    g_tp.cam.cell_a_w    += curr_alpha * curr_Wmemcella_cam;//sheng
+    g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam;
+    g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam;
+    area_cell_cam += curr_alpha * curr_area_cell_cam;
+    asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam;
+
+    //Sense amplifier latch Gm calculation
+    mobility_eff_periph_global += curr_alpha * mobility_eff[peri_global_tech_type];
+    Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type];
+
+    //Empirical undifferentiated core/FU coefficient
+    g_tp.scaling_factor.logic_scaling_co_eff += curr_alpha * curr_logic_scaling_co_eff;
+    g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density;
+    g_tp.chip_layout_overhead  += curr_alpha * curr_chip_layout_overhead;
+    g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead;
+    g_tp.sckt_co_eff           += curr_alpha * curr_sckt_co_eff;
+  }
+
+
+  //Currently we are not modeling the resistance/capacitance of poly anywhere.
+  //Continuous functions (or data that have already been processed) do not need linear interpolation
+  g_tp.w_comp_inv_p1 = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n1 =  7.5 * g_ip->F_sz_um;//this was  6 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p2 =   25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n2 =   15 * g_ip->F_sz_um;//this was 12 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p3 =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n3 =   30 * g_ip->F_sz_um;//this was 24 micron for the 0.8 micron process
+  g_tp.w_eval_inv_p  =  100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process
+  g_tp.w_eval_inv_n  =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_n     = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_p     = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process
+
+  g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um;
+  g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um;
+  g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um;
+  g_tp.cell_h_def = 50 * g_ip->F_sz_um;
+  g_tp.w_poly_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um;
+  g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um;
+
+  g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2;
+  g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um;
+  g_tp.w_iso       = 12.5*g_ip->F_sz_um;//was 10 micron for the 0.8 micron process
+  g_tp.w_sense_n   = 3.75*g_ip->F_sz_um; // sense amplifier N-trans; was 3 micron for the 0.8 micron process
+  g_tp.w_sense_p   = 7.5*g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron for the 0.8 micron process
+  g_tp.w_sense_en  = 5*g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was 4 micron for the 0.8 micron process
+  g_tp.w_nmos_b_mux  = 6 * g_tp.min_w_nmos_;
+  g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_;
+
+  if (ram_cell_tech_type == comm_dram)
+  {
+    g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um;
+    g_tp.h_dec          = 8;  // in the unit of memory cell height
+  }
+  else
+  {
+    g_tp.max_w_nmos_dec = g_tp.max_w_nmos_;
+    g_tp.h_dec          = 4;  // in the unit of memory cell height
+  }
+
+  g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal;
+  g_tp.sram_cell.C_overlap   = 0.2 * g_tp.sram_cell.C_g_ideal;
+  g_tp.cam_cell.C_overlap    = 0.2 * g_tp.cam_cell.C_g_ideal;
+
+  g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal;
+  g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n;
+  //g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p;
+
+  g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal;
+
+  double gmn_sense_amp_latch = (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox * (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global;
+  double gmp_sense_amp_latch = gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch;
+  g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch;
+
+  g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram));
+  g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w;
+  g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram));
+  g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w;
+  g_tp.cam.b_w =  sqrt(area_cell_cam / (asp_ratio_cell_cam));//Sheng
+  g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w;
+
+  g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd;
+  g_tp.sram.Vbitpre = vdd[ram_cell_tech_type];
+  g_tp.cam.Vbitpre = vdd[ram_cell_tech_type];//Sheng
+  pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+  g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+
+
+  double wire_pitch       [NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES];
+
+  for (iter=0; iter<=1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 90)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron
+      aspect_ratio[0][0] = 2.4;
+      wire_width = wire_pitch[0][0] / 2; //micron
+      wire_thickness = aspect_ratio[0][0] * wire_width;//micron
+      wire_spacing = wire_pitch[0][0] - wire_width;//micron
+      barrier_thickness = 0.01;//micron
+      dishing_thickness = 0;//micron
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron
+      ild_thickness[0][0] = 0.48;//micron
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 2.709;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15; //F/micron
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap);//F/micron.
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 2.4;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.48;//micron
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 2.709;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.7;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.96;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 2.709;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0]  = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.008;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0]  = 0.48;
+      miller_value[1][0]  = 1.5;
+      horiz_dielectric_constant[1][0]  = 3.038;
+      vert_dielectric_constant[1][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1]  = 0.48;
+      miller_value[1][1]  = 1.5;
+      horiz_dielectric_constant[1][1]  = 3.038;
+      vert_dielectric_constant[1][1]  = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2]  = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2]  = 1.1;
+      miller_value[1][2]  = 1.5;
+      horiz_dielectric_constant[1][2]  = 3.038;
+      vert_dielectric_constant[1][2]  = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.09;
+      wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09);
+      wire_r_per_micron[1][3] = 12 / 0.09;
+    }
+    else if (tech == 65)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 2.7;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.405;
+      miller_value[0][0]   = 1.5;
+      horiz_dielectric_constant[0][0]  = 2.303;
+      vert_dielectric_constant[0][0]   = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 2.7;
+      wire_thickness = aspect_ratio[0][1]  * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.405;
+      miller_value[0][1]   = 1.5;
+      horiz_dielectric_constant[0][1]  = 2.303;
+      vert_dielectric_constant[0][1]   = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.8;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.81;
+      miller_value[0][2]   = 1.5;
+      horiz_dielectric_constant[0][2]  = 2.303;
+      vert_dielectric_constant[0][2]   = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.006;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.405;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.734;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.405;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.734;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.77;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.734;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.065;
+      wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065);
+      wire_r_per_micron[1][3] = 12 / 0.065;
+    }
+    else if (tech == 45)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.315;
+      miller_value[0][0]  = 1.5;
+      horiz_dielectric_constant[0][0]  = 1.958;
+      vert_dielectric_constant[0][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.315;
+      miller_value[0][1]  = 1.5;
+      horiz_dielectric_constant[0][1]  = 1.958;
+      vert_dielectric_constant[0][1]  = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.63;
+      miller_value[0][2]  = 1.5;
+      horiz_dielectric_constant[0][2]  = 1.958;
+      vert_dielectric_constant[0][2]  = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.004;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.315;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.46;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.315;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.46;
+      vert_dielectric_constant[1][1] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.55;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.46;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.045;
+      wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045);
+      wire_r_per_micron[1][3] = 12 / 0.045;
+    }
+    else if (tech == 32)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.21;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.664;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.21;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.664;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.42;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.664;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.003;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.21;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.214;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      aspect_ratio[1][1] = 2.0;
+      wire_width = wire_pitch[1][1] / 2;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.21;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.214;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.385;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.214;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.032;//micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032);//F/micron
+      wire_r_per_micron[1][3] = 12 / 0.032;//ohm/micron
+    }
+    else if (tech == 22)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.15;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.414;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          wire_width = wire_pitch[0][1] / 2;
+          aspect_ratio[0][1] = 3.0;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.15;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.414;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+                          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.3;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.414;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+                          fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.003;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.15;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 2.104;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.15;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 2.104;
+          vert_dielectric_constant[1][1] = 3.9;
+          wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.275;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 2.104;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                        ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+                        fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.022;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.022;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+        }
+
+    else if (tech == 16)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.108;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.202;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          aspect_ratio[0][1] = 3.0;
+          wire_width = wire_pitch[0][1] / 2;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.108;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.202;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+                          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.216;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.202;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+                          fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//                       wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                       ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                       fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.002;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.108;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 1.998;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.108;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 1.998;
+          vert_dielectric_constant[1][1] = 3.9;
+            wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+                        wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.198;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 1.998;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+                        ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+                        fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.016;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.016;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//                     wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//                     ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//                     fringe_cap);
+        }
+    g_tp.wire_local.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.R_per_um += curr_alpha * wire_r_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.C_per_um += curr_alpha * wire_c_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+
+    g_tp.wire_inside_mat.pitch     += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.R_per_um  += curr_alpha* wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.C_per_um  += curr_alpha* wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+
+    g_tp.wire_outside_mat.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.R_per_um += curr_alpha*wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.C_per_um += curr_alpha*wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+
+    g_tp.unit_len_wire_del = g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2;
+
+    g_tp.sense_delay               += curr_alpha *SENSE_AMP_D;
+    g_tp.sense_dy_power            += curr_alpha *SENSE_AMP_P;
+//    g_tp.horiz_dielectric_constant += horiz_dielectric_constant;
+//    g_tp.vert_dielectric_constant  += vert_dielectric_constant;
+//    g_tp.aspect_ratio              += aspect_ratio;
+//    g_tp.miller_value              += miller_value;
+//    g_tp.ild_thickness             += ild_thickness;
+
+  }
+  g_tp.fringe_cap = fringe_cap;
+
+  double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio();
+  double c_load = gate_C(g_tp.min_w_nmos_ * (1 + p_to_n_sizing_r), 0.0);
+  double tf = rd * c_load;
+  g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE);
+  double KLOAD = 1;
+  c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+                    drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1, g_tp.cell_h_def) +
+                    gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0));
+  tf = rd * c_load;
+  g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE);
+}
+
diff --git a/ext/mcpat/version.h b/ext/mcpat/version.h
new file mode 100644 (file)
index 0000000..76d8c75
--- /dev/null
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef VERSION_H_
+#define VERSION_H_
+
+#define VER_MAJOR              0       /* major version number; 0 marks a beta release */
+#define VER_MINOR              8       /* minor version number */
+
+#define VER_UPDATE             "Aug, 2010" /* date string for this version (presumably the last-update date; confirm) */
+
+#endif /* VERSION_H_ */
diff --git a/ext/mcpat/xmlParser.cc b/ext/mcpat/xmlParser.cc
new file mode 100644 (file)
index 0000000..5ac45ed
--- /dev/null
@@ -0,0 +1,2891 @@
+/**
+ ****************************************************************************
+ * <P> XML.c - implementation file for basic XML parser written in ANSI C++
+ * for portability. It works by using recursion and a node tree for breaking
+ * down the elements of an XML document.  </P>
+ *
+ * @version     V2.41
+ * @author      Frank Vanden Berghen
+ *
+ * NOTE:
+ *
+ *   If you add "#define STRICT_PARSING", on the first line of this file
+ *   the parser will see the following XML-stream:
+ *      <a><b>some text</b><b>other text    </a>
+ *   as an error. Otherwise, this string will be equivalent to:
+ *      <a><b>some text</b><b>other text</b></a>
+ *
+ * NOTE:
+ *
+ *   If you add "#define APPROXIMATE_PARSING" on the first line of this file
+ *   the parser will see the following XML-stream:
+ *     <data name="n1">
+ *     <data name="n2">
+ *     <data name="n3" />
+ *   as equivalent to the following XML-stream:
+ *     <data name="n1" />
+ *     <data name="n2" />
+ *     <data name="n3" />
+ *   This can be useful for badly-formed XML-streams but prevent the use
+ *   of the following XML-stream (problem is: tags at contiguous levels
+ *   have the same names):
+ *     <data name="n1">
+ *        <data name="n2">
+ *            <data name="n3" />
+ *        </data>
+ *     </data>
+ *
+ * NOTE:
+ *
+ *   If you add "#define _XMLPARSER_NO_MESSAGEBOX_" on the first line of this file
+ *   the "openFileHelper" function will always display error messages inside the
+ *   console instead of inside a message-box-window. Message-box-windows are
+ *   available on windows 9x/NT/2000/XP/Vista only.
+ *
+ * The following license terms for the "XMLParser library from Business-Insight" apply to projects
+ * that are in some way related to
+ * the "mcpat project", including applications
+ * using "mcpat project" and tools developed
+ * for enhancing "mcpat project". All other projects
+ * (not related to "mcpat project") have to use the "XMLParser library from Business-Insight"
+ * code under the Aladdin Free Public License (AFPL)
+ * See the file "AFPL-license.txt" for more information about the AFPL license.
+ * (see http://www.artifex.com/downloads/doc/Public.htm for detailed AFPL terms)
+ *
+ * Redistribution and use of the "XMLParser library from Business-Insight" in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Frank Vanden Berghen nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2002, Business-Insight
+ * <a href="http://www.Business-Insight.com">Business-Insight</a>
+ * All rights reserved.
+ *
+ ****************************************************************************
+ */
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#include "xmlParser.h"
+#ifdef _XMLWINDOWS
+//#ifdef _DEBUG
+//#define _CRTDBG_MAP_ALLOC
+//#include <crtdbg.h>
+//#endif
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h> // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte to handle unicode files
+                     // to have "MessageBoxA" to display error messages for openFileHelper
+#endif
+
+#include <memory.h>
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+XMLCSTR XMLNode::getVersion() { return _CXML("v2.41"); } // library version string; kept in step with the @version tag in this file's header comment
+void freeXMLString(XMLSTR t) { free(t); } // hands t to free(); free(NULL) is a no-op, so a null t is harmless
+
+static XMLNode::XMLCharEncoding characterEncoding=XMLNode::char_encoding_UTF8; // file-local encoding setting; starts as UTF-8
+static char guessWideCharChars=1, dropWhiteSpace=1, removeCommentsInMiddleOfText=1; // file-local option flags, all initialised to 1 (presumably "enabled"; confirm where they are read)
+
+inline int mmin( const int t1, const int t2 ) { if (t2 < t1) return t2; return t1; } // smaller of the two ints
+
+// You can modify the initialization of the variable "XMLClearTags" below
+// to change the clearTags that are currently recognized by the library.
+// The number in the second column is the length of the string in the
+// first column. The "<!DOCTYPE" declaration must be the second in the list.
+// The "<!--" declaration must be the third in the list.
+typedef struct { XMLCSTR lpszOpen; int openTagLen; XMLCSTR lpszClose;} ALLXMLClearTag; // opening text, its length, closing text
+static ALLXMLClearTag XMLClearTags[] =
+{
+    {    _CXML("<![CDATA["),9,  _CXML("]]>")      },
+    {    _CXML("<!DOCTYPE"),9,  _CXML(">")        }, // must stay second in the list
+    {    _CXML("<!--")     ,4,  _CXML("-->")      }, // must stay third in the list
+    {    _CXML("<PRE>")    ,5,  _CXML("</PRE>")   },
+//  {    _CXML("<Script>") ,8,  _CXML("</Script>")},
+    {    NULL              ,0,  NULL           } // all-NULL last entry; presumably the end-of-table sentinel (confirm in the scanning code)
+};
+
+// You can modify the initialization of the variable "XMLEntities" below
+// to change the character entities that are currently recognized by the library.
+// The number in the second column is the length of the string in the
+// first column. Additionally, the syntaxes "&#xA0;" and "&#160;" are recognized.
+typedef struct { XMLCSTR s; int l; XMLCHAR c;} XMLCharacterEntity; // entity text, its length, the character it stands for
+static XMLCharacterEntity XMLEntities[] =
+{
+    { _CXML("&amp;" ), 5, _CXML('&' )},
+    { _CXML("&lt;"  ), 4, _CXML('<' )},
+    { _CXML("&gt;"  ), 4, _CXML('>' )},
+    { _CXML("&quot;"), 6, _CXML('\"')},
+    { _CXML("&apos;"), 6, _CXML('\'')},
+    { NULL           , 0, '\0'    } // NULL/0 last entry; presumably the end-of-table sentinel (confirm in the scanning code)
+};
+
+// When rendering the XMLNode to a string (using the "createXMLString" function),
+// you can ask for nicely formatted (indented) output. That formatting uses the
+// following indentation character:
+#define INDENTCHAR _CXML('\t')
+
+// The following function parses the XML errors into a user friendly string.
+// You can edit this to change the output language of the library to something else.
+XMLCSTR XMLNode::getError(XMLError xerror)
+{
+    switch (xerror)
+    {
+    case eXMLErrorNone:                  return _CXML("No error");
+    case eXMLErrorMissingEndTag:         return _CXML("Warning: Unmatched end tag");
+    case eXMLErrorNoXMLTagFound:         return _CXML("Warning: No XML tag found");
+    case eXMLErrorEmpty:                 return _CXML("Error: No XML data");
+    case eXMLErrorMissingTagName:        return _CXML("Error: Missing start tag name");
+    case eXMLErrorMissingEndTagName:     return _CXML("Error: Missing end tag name");
+    case eXMLErrorUnmatchedEndTag:       return _CXML("Error: Unmatched end tag");
+    case eXMLErrorUnmatchedEndClearTag:  return _CXML("Error: Unmatched clear tag end");
+    case eXMLErrorUnexpectedToken:       return _CXML("Error: Unexpected token found");
+    case eXMLErrorNoElements:            return _CXML("Error: No elements found");
+    case eXMLErrorFileNotFound:          return _CXML("Error: File not found");
+    case eXMLErrorFirstTagNotFound:      return _CXML("Error: First Tag not found");
+    case eXMLErrorUnknownCharacterEntity:return _CXML("Error: Unknown character entity");
+    case eXMLErrorCharacterCodeAbove255: return _CXML("Error: Character code above 255 is forbidden in MultiByte char mode.");
+    case eXMLErrorCharConversionError:   return _CXML("Error: unable to convert between WideChar and MultiByte chars");
+    case eXMLErrorCannotOpenWriteFile:   return _CXML("Error: unable to open file for writing");
+    case eXMLErrorCannotWriteFile:       return _CXML("Error: cannot write into file");
+
+    case eXMLErrorBase64DataSizeIsNotMultipleOf4: return _CXML("Warning: Base64-string length is not a multiple of 4");
+    case eXMLErrorBase64DecodeTruncatedData:      return _CXML("Warning: Base64-string is truncated");
+    case eXMLErrorBase64DecodeIllegalCharacter:   return _CXML("Error: Base64-string contains an illegal character");
+    case eXMLErrorBase64DecodeBufferTooSmall:     return _CXML("Error: Base64 decode output buffer is too small");
+    };
+    return _CXML("Unknown");
+}
+
+/////////////////////////////////////////////////////////////////////////
+//      Here starts the abstraction layer to be OS-independent         //
+/////////////////////////////////////////////////////////////////////////
+
+// Here is an abstraction layer to access some common string manipulation functions.
+// The abstraction layer is currently working for gcc, Microsoft Visual Studio 6.0,
+// Microsoft Visual Studio .NET, CC (sun compiler) and Borland C++.
+// If you plan to "port" the library to a new system/compiler, all you have to do is
+// to edit the following lines.
+// myIsTextWideChar: heuristic that reports (TRUE/FALSE) whether the buffer "b"
+// of "len" bytes looks like wide-character text.
+#ifdef XML_NO_WIDE_CHAR
+// wide-char support is compiled out: never report wide-char text
+char myIsTextWideChar(const void *b, int len) { return FALSE; }
+#else
+    #if defined (UNDER_CE) || !defined(_XMLWINDOWS)
+    char myIsTextWideChar(const void *b, int len) // inspired by the Wine API: RtlIsTextUnicode
+    {
+#ifdef sun
+        // for SPARC processors: wchar_t* buffers must always be aligned, otherwise it's a char* buffer.
+        if ((((unsigned long)b)%sizeof(wchar_t))!=0) return FALSE;
+#endif
+        const wchar_t *s=(const wchar_t*)b;
+
+        // buffer too small:
+        if (len<(int)sizeof(wchar_t)) return FALSE;
+
+        // odd length test
+        if (len&1) return FALSE;
+
+        /* only checks the first 256 characters */
+        // from here on, len counts wchar_t elements instead of bytes
+        len=mmin(256,len/sizeof(wchar_t));
+
+        // Check for the special byte order:
+        if (*((unsigned short*)s) == 0xFFFE) return TRUE;     // IS_TEXT_UNICODE_REVERSE_SIGNATURE;
+        if (*((unsigned short*)s) == 0xFEFF) return TRUE;      // IS_TEXT_UNICODE_SIGNATURE
+
+        // checks for ASCII characters in the UNICODE stream
+        // (wide text if more than half of the inspected characters are <= 255)
+        int i,stats=0;
+        for (i=0; i<len; i++) if (s[i]<=(unsigned short)255) stats++;
+        if (stats>len/2) return TRUE;
+
+        // Check for UNICODE NULL chars
+        for (i=0; i<len; i++) if (!s[i]) return TRUE;
+
+        return FALSE;
+    }
+    #else
+    // on Windows (except CE) the decision is delegated to IsTextUnicode()
+    char myIsTextWideChar(const void *b,int l) { return (char)IsTextUnicode((CONST LPVOID)b,l,NULL); };
+    #endif
+#endif
+
+#ifdef _XMLWINDOWS
+// for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland C++ Builder 6.0
+    #ifdef _XMLWIDECHAR
+        wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce)
+        {
+            int i;
+            if (ce==XMLNode::char_encoding_UTF8) i=(int)MultiByteToWideChar(CP_UTF8,0             ,s,-1,NULL,0);
+            else                            i=(int)MultiByteToWideChar(CP_ACP ,MB_PRECOMPOSED,s,-1,NULL,0);
+            if (i<0) return NULL;
+            wchar_t *d=(wchar_t *)malloc((i+1)*sizeof(XMLCHAR));
+            if (ce==XMLNode::char_encoding_UTF8) i=(int)MultiByteToWideChar(CP_UTF8,0             ,s,-1,d,i);
+            else                            i=(int)MultiByteToWideChar(CP_ACP ,MB_PRECOMPOSED,s,-1,d,i);
+            d[i]=0;
+            return d;
+        }
+        // String/file helpers for the Windows wide-char build: thin wrappers
+        // over the wide-character C runtime functions.
+        static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return _wfopen(filename,mode); }
+        static inline int xstrlen(XMLCSTR c)   { return (int)wcslen(c); }
+        static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return _wcsnicmp(c1,c2,l);}
+        static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncmp(c1,c2,l);}
+        static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _wcsicmp(c1,c2); }
+        static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)wcsstr(c1,c2); }
+        static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)wcscpy(c1,c2); }
+    #else
+        char *myWideCharToMultiByte(const wchar_t *s)
+        {
+            UINT codePage=CP_ACP; if (characterEncoding==XMLNode::char_encoding_UTF8) codePage=CP_UTF8;
+            int i=(int)WideCharToMultiByte(codePage,  // code page
+                0,                       // performance and mapping flags
+                s,                       // wide-character string
+                -1,                       // number of chars in string
+                NULL,                       // buffer for new string
+                0,                       // size of buffer
+                NULL,                    // default for unmappable chars
+                NULL                     // set when default char used
+                );
+            if (i<0) return NULL;
+            char *d=(char*)malloc(i+1);
+            WideCharToMultiByte(codePage,  // code page
+                0,                       // performance and mapping flags
+                s,                       // wide-character string
+                -1,                       // number of chars in string
+                d,                       // buffer for new string
+                i,                       // size of buffer
+                NULL,                    // default for unmappable chars
+                NULL                     // set when default char used
+                );
+            d[i]=0;
+            return d;
+        }
+        // String/file helpers for the Windows narrow-char build: thin wrappers
+        // over the C runtime. Borland names the case-insensitive comparisons
+        // without the leading underscore.
+        static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return fopen(filename,mode); }
+        static inline int xstrlen(XMLCSTR c)   { return (int)strlen(c); }
+        #ifdef __BORLANDC__
+            static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return strnicmp(c1,c2,l);}
+            static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return stricmp(c1,c2); }
+        #else
+            static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return _strnicmp(c1,c2,l);}
+            static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _stricmp(c1,c2); }
+        #endif
+        static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncmp(c1,c2,l);}
+        static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)strstr(c1,c2); }
+        static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)strcpy(c1,c2); }
+    #endif
+#else
+// for gcc and CC
+    #ifdef XML_NO_WIDE_CHAR
+        char *myWideCharToMultiByte(const wchar_t *s) { return NULL; }
+    #else
+        char *myWideCharToMultiByte(const wchar_t *s)
+        {
+            const wchar_t *ss=s;
+            int i=(int)wcsrtombs(NULL,&ss,0,NULL);
+            if (i<0) return NULL;
+            char *d=(char *)malloc(i+1);
+            wcsrtombs(d,&s,i,NULL);
+            d[i]=0;
+            return d;
+        }
+    #endif
+    #ifdef _XMLWIDECHAR
+        wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce)
+        {
+            const char *ss=s;
+            int i=(int)mbsrtowcs(NULL,&ss,0,NULL);
+            if (i<0) return NULL;
+            wchar_t *d=(wchar_t *)malloc((i+1)*sizeof(wchar_t));
+            mbsrtowcs(d,&s,i,NULL);
+            d[i]=0;
+            return d;
+        }
+        // String helpers for the wide-char build on gcc / CC: thin wrappers over
+        // the wide-character library routines.
+        // NOTE(review): unlike its siblings, xstrlen is not declared
+        // "static inline" in this branch - confirm whether that is intended.
+        int xstrlen(XMLCSTR c)   { return wcslen(c); }
+        #ifdef sun
+        // for CC
+           #include <widec.h>
+           static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return wsncasecmp(c1,c2,l);}
+           static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wsncmp(c1,c2,l);}
+           static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return wscasecmp(c1,c2); }
+        #else
+        // for gcc
+           static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncasecmp(c1,c2,l);}
+           static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncmp(c1,c2,l);}
+           static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return wcscasecmp(c1,c2); }
+        #endif
+        static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)wcsstr(c1,c2); }
+        static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)wcscpy(c1,c2); }
+        static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode)
+        {
+            char *filenameAscii=myWideCharToMultiByte(filename);
+            FILE *f;
+            if (mode[0]==_CXML('r')) f=fopen(filenameAscii,"rb");
+            else                     f=fopen(filenameAscii,"wb");
+            free(filenameAscii);
+            return f;
+        }
+    #else
+        static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return fopen(filename,mode); }
+        static inline int xstrlen(XMLCSTR c)   { return strlen(c); }
+        static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncasecmp(c1,c2,l);}
+        static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncmp(c1,c2,l);}
+        static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return strcasecmp(c1,c2); }
+        static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)strstr(c1,c2); }
+        static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)strcpy(c1,c2); }
+    #endif
+    // Provide the Windows-style name _strnicmp on non-Windows builds by
+    // forwarding to strncasecmp().
+    static inline int _strnicmp(const char *c1,const char *c2, int l)
+    {
+        return strncasecmp(c1,c2,l);
+    }
+#endif
+
+
+///////////////////////////////////////////////////////////////////////////////
+//            the "xmltoc,xmltob,xmltoi,xmltol,xmltof,xmltoa" functions      //
+///////////////////////////////////////////////////////////////////////////////
+// These 6 functions are not used inside the XMLparser.
+// They are only here as "convenience" functions for the user.
+// If you don't need them, you can delete them without any trouble.
+#ifdef _XMLWIDECHAR
+    #ifdef _XMLWINDOWS
+    // for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland C++ Builder 6.0
+        char    xmltob(XMLCSTR t,int     v){ if (t&&(*t)) return (char)_wtoi(t); return v; }
+        int     xmltoi(XMLCSTR t,int     v){ if (t&&(*t)) return _wtoi(t); return v; }
+        long    xmltol(XMLCSTR t,long    v){ if (t&&(*t)) return _wtol(t); return v; }
+        double  xmltof(XMLCSTR t,double  v){ if (t&&(*t)) wscanf(t, "%f", &v); /*v=_wtof(t);*/ return v; }
+    #else
+        #ifdef sun
+        // for CC
+           #include <widec.h>
+           char    xmltob(XMLCSTR t,int     v){ if (t) return (char)wstol(t,NULL,10); return v; }
+           int     xmltoi(XMLCSTR t,int     v){ if (t) return (int)wstol(t,NULL,10); return v; }
+           long    xmltol(XMLCSTR t,long    v){ if (t) return wstol(t,NULL,10); return v; }
+        #else
+        // for gcc
+           char    xmltob(XMLCSTR t,int     v){ if (t) return (char)wcstol(t,NULL,10); return v; }
+           int     xmltoi(XMLCSTR t,int     v){ if (t) return (int)wcstol(t,NULL,10); return v; }
+           long    xmltol(XMLCSTR t,long    v){ if (t) return wcstol(t,NULL,10); return v; }
+        #endif
+                double  xmltof(XMLCSTR t,double  v){ if (t&&(*t)) wscanf(t, "%f", &v); /*v=_wtof(t);*/ return v; }
+    #endif
+#else
+    char    xmltob(XMLCSTR t,char    v){ if (t&&(*t)) return (char)atoi(t); return v; }
+    int     xmltoi(XMLCSTR t,int     v){ if (t&&(*t)) return atoi(t); return v; }
+    long    xmltol(XMLCSTR t,long    v){ if (t&&(*t)) return atol(t); return v; }
+    double  xmltof(XMLCSTR t,double  v){ if (t&&(*t)) return atof(t); return v; }
+#endif
+XMLCSTR xmltoa(XMLCSTR t,XMLCSTR v){ if (t)       return  t; return v; }
+XMLCHAR xmltoc(XMLCSTR t,XMLCHAR v){ if (t&&(*t)) return *t; return v; }
+
+/////////////////////////////////////////////////////////////////////////
+//                    the "openFileHelper" function                    //
+/////////////////////////////////////////////////////////////////////////
+
+// Since each application has its own way to report and deal with errors, you should modify & rewrite
+// the following "openFileHelper" function to get an "error reporting mechanism" tailored to your needs.
+XMLNode XMLNode::openFileHelper(XMLCSTR filename, XMLCSTR tag)
+{
+    // guess the value of the global parameter "characterEncoding"
+    // (the guess is based on the first 200 bytes of the file).
+    FILE *f=xfopen(filename,_CXML("rb"));
+    if (f)
+    {
+        char bb[205];
+        int l=(int)fread(bb,1,200,f);
+        setGlobalOptions(guessCharEncoding(bb,l),guessWideCharChars,dropWhiteSpace,removeCommentsInMiddleOfText);
+        fclose(f);
+    }
+
+    // parse the file
+    XMLResults pResults;
+    XMLNode xnode=XMLNode::parseFile(filename,tag,&pResults);
+
+    // display error message (if any)
+    if (pResults.error != eXMLErrorNone)
+    {
+        // create message
+        char message[2000],*s1=(char*)"",*s3=(char*)""; XMLCSTR s2=_CXML("");
+        if (pResults.error==eXMLErrorFirstTagNotFound) { s1=(char*)"First Tag should be '"; s2=tag; s3=(char*)"'.\n"; }
+        sprintf(message,
+#ifdef _XMLWIDECHAR
+            "XML Parsing error inside file '%S'.\n%S\nAt line %i, column %i.\n%s%S%s"
+#else
+            "XML Parsing error inside file '%s'.\n%s\nAt line %i, column %i.\n%s%s%s"
+#endif
+            ,filename,XMLNode::getError(pResults.error),pResults.nLine,pResults.nColumn,s1,s2,s3);
+
+        // display message
+#if defined(_XMLWINDOWS) && !defined(UNDER_CE) && !defined(_XMLPARSER_NO_MESSAGEBOX_)
+        MessageBoxA(NULL,message,"XML Parsing error",MB_OK|MB_ICONERROR|MB_TOPMOST);
+#else
+        printf("%s",message);
+#endif
+        exit(255);
+    }
+    return xnode;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//      Here start the core implementation of the XMLParser library    //
+/////////////////////////////////////////////////////////////////////////
+
+// You should normally not change anything below this point.
+
+#ifndef _XMLWIDECHAR
+// If "characterEncoding=ascii" then we assume that all characters have the same length of 1 byte.
+// If "characterEncoding=UTF8" then the characters have different lengths (from 1 byte to 4 bytes).
+// If "characterEncoding=ShiftJIS" then the characters have different lengths (from 1 byte to 2 bytes).
+// Each table below is a lookup table giving the length of a character (in bytes) based on the
+// content of the first byte of the character.
+// (note: if you modify this, you must always have XML_utf8ByteTable[0]=0 ).
+// UTF-8 table; bytes that cannot start a character are given length 1.
+static const char XML_utf8ByteTable[256] =
+{
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70 End of ASCII range
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x80 0x80 to 0xc1 invalid
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x90
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xa0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xb0
+    1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0 0xc2 to 0xdf 2 byte
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,// 0xe0 0xe0 to 0xef 3 byte
+    4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
+};
+// Character-length table for single-byte encodings: every byte value maps
+// to length 1, except byte 0 which maps to length 0.
+static const char XML_legacyByteTable[256] =
+{
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+// Character-length table for Shift-JIS, indexed by the first byte of a
+// character (lengths are 1 or 2 bytes; byte 0 maps to length 0).
+static const char XML_sjisByteTable[256] =
+{
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70
+    1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x80 0x81 to 0x9F 2 bytes
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x90
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xa0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xb0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xc0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xd0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0 0xe0 to 0xef 2 bytes
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // 0xf0
+};
+// Character-length table for GB2312, indexed by the first byte of a
+// character (lengths are 1 or 2 bytes; byte 0 maps to length 0).
+static const char XML_gb2312ByteTable[256] =
+{
+//  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x80
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x90
+    1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xa0 0xa1 to 0xf7 2 bytes
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xb0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0
+    2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1 // 0xf0
+};
+// Character-length table shared by GBK and Big5, indexed by the first byte
+// of a character (lengths are 1 or 2 bytes; byte 0 maps to length 0).
+static const char XML_gbk_big5_ByteTable[256] =
+{
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70
+    1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x80 0x81 to 0xfe 2 bytes
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x90
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xa0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xb0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1 // 0xf0
+};
+// Character-length table currently in effect; it is indexed with the first
+// byte of a character wherever the parser steps through narrow-char text.
+static const char *XML_ByteTable=(const char *)XML_utf8ByteTable; // the default is "characterEncoding=XMLNode::encoding_UTF8"
+#endif
+
+
+// Definitions of the static "empty" objects declared in XMLNode; the clear
+// and attribute objects have every pointer member set to NULL.
+XMLNode XMLNode::emptyXMLNode;
+XMLClear XMLNode::emptyXMLClear={ NULL, NULL, NULL};
+XMLAttribute XMLNode::emptyXMLAttribute={ NULL, NULL};
+
+// Enumeration used to decipher what type a token is
+typedef enum XMLTokenTypeTag
+{
+    eTokenText = 0,
+    eTokenQuotedText,
+    eTokenTagStart,         /* "<"            */
+    eTokenTagEnd,           /* "</"           */
+    eTokenCloseTag,         /* ">"            */
+    eTokenEquals,           /* "="            */
+    eTokenDeclaration,      /* "<?"           */
+    eTokenShortHandClose,   /* "/>"           */
+    eTokenClear,
+    eTokenError
+} XMLTokenType;
+
+// Main structure used for parsing XML
+typedef struct XML
+{
+    XMLCSTR                lpXML;
+    XMLCSTR                lpszText;
+    int                    nIndex,nIndexMissigEndTag;
+    enum XMLError          error;
+    XMLCSTR                lpEndTag;
+    int                    cbEndTag;
+    XMLCSTR                lpNewElement;
+    int                    cbNewElement;
+    int                    nFirst;
+} XML;
+
+// Result of GetNextToken().
+typedef struct
+{
+    ALLXMLClearTag *pClr;   // matching entry of XMLClearTags when the token is a clear tag (initialized to NULL)
+    XMLCSTR     pStr;       // points at the first character of the token inside the parsed text
+} NextToken;
+
+// Which part of an attribute the parser is currently handling.
+enum Attrib
+{
+    eAttribName = 0,
+    eAttribEquals,
+    eAttribValue
+};
+typedef enum Attrib Attrib;
+
+// Tells the element parser whether it is currently inside a tag or
+// outside of one.
+enum Status
+{
+    eInsideTag = 0,
+    eOutsideTag
+};
+typedef enum Status Status;
+
+XMLError XMLNode::writeToFile(XMLCSTR filename, const char *encoding, char nFormat) const
+{
+    if (!d) return eXMLErrorNone;
+    FILE *f=xfopen(filename,_CXML("wb"));
+    if (!f) return eXMLErrorCannotOpenWriteFile;
+#ifdef _XMLWIDECHAR
+    unsigned char h[2]={ 0xFF, 0xFE };
+    if (!fwrite(h,2,1,f)) return eXMLErrorCannotWriteFile;
+    if ((!isDeclaration())&&((d->lpszName)||(!getChildNode().isDeclaration())))
+    {
+        if (!fwrite(L"<?xml version=\"1.0\" encoding=\"utf-16\"?>\n",sizeof(wchar_t)*40,1,f))
+            return eXMLErrorCannotWriteFile;
+    }
+#else
+    if ((!isDeclaration())&&((d->lpszName)||(!getChildNode().isDeclaration())))
+    {
+        if (characterEncoding==char_encoding_UTF8)
+        {
+            // header so that windows recognize the file as UTF-8:
+            unsigned char h[3]={0xEF,0xBB,0xBF}; if (!fwrite(h,3,1,f)) return eXMLErrorCannotWriteFile;
+            encoding="utf-8";
+        } else if (characterEncoding==char_encoding_ShiftJIS) encoding="SHIFT-JIS";
+
+        if (!encoding) encoding="ISO-8859-1";
+        if (fprintf(f,"<?xml version=\"1.0\" encoding=\"%s\"?>\n",encoding)<0) return eXMLErrorCannotWriteFile;
+    } else
+    {
+        if (characterEncoding==char_encoding_UTF8)
+        {
+            unsigned char h[3]={0xEF,0xBB,0xBF}; if (!fwrite(h,3,1,f)) return eXMLErrorCannotWriteFile;
+        }
+    }
+#endif
+    int i;
+    XMLSTR t=createXMLString(nFormat,&i);
+    if (!fwrite(t,sizeof(XMLCHAR)*i,1,f)) return eXMLErrorCannotWriteFile;
+    if (fclose(f)!=0) return eXMLErrorCannotWriteFile;
+    free(t);
+    return eXMLErrorNone;
+}
+
+// Duplicate a given string.
+//  - lpszData: string to copy; NULL yields NULL
+//  - cbData  : number of characters to copy, or -1 to copy up to the
+//              terminating zero
+// Returns a newly malloc'ed, zero-terminated copy, or NULL when the
+// allocation fails.
+XMLSTR stringDup(XMLCSTR lpszData, int cbData)
+{
+    if (lpszData==NULL) return NULL;
+
+    XMLSTR lpszNew;
+    if (cbData==-1) cbData=(int)xstrlen(lpszData);
+    lpszNew = (XMLSTR)malloc((cbData+1) * sizeof(XMLCHAR));
+    if (lpszNew)
+    {
+        memcpy(lpszNew, lpszData, (cbData) * sizeof(XMLCHAR));
+        // terminate with a zero character (NULL is a pointer constant, not a character)
+        lpszNew[cbData] = 0;
+    }
+    return lpszNew;
+}
+
+XMLSTR ToXMLStringTool::toXMLUnSafe(XMLSTR dest,XMLCSTR source)
+{
+    XMLSTR dd=dest;
+    XMLCHAR ch;
+    XMLCharacterEntity *entity;
+    while ((ch=*source))
+    {
+        entity=XMLEntities;
+        do
+        {
+            if (ch==entity->c) {xstrcpy(dest,entity->s); dest+=entity->l; source++; goto out_of_loop1; }
+            entity++;
+        } while(entity->s);
+#ifdef _XMLWIDECHAR
+        *(dest++)=*(source++);
+#else
+        switch(XML_ByteTable[(unsigned char)ch])
+        {
+        case 4: *(dest++)=*(source++);
+        case 3: *(dest++)=*(source++);
+        case 2: *(dest++)=*(source++);
+        case 1: *(dest++)=*(source++);
+        }
+#endif
+out_of_loop1:
+        ;
+    }
+    *dest=0;
+    return dd;
+}
+
+// private (used while rendering):
+int ToXMLStringTool::lengthXMLString(XMLCSTR source)
+{
+    int r=0;
+    XMLCharacterEntity *entity;
+    XMLCHAR ch;
+    while ((ch=*source))
+    {
+        entity=XMLEntities;
+        do
+        {
+            if (ch==entity->c) { r+=entity->l; source++; goto out_of_loop1; }
+            entity++;
+        } while(entity->s);
+#ifdef _XMLWIDECHAR
+        r++; source++;
+#else
+        ch=XML_ByteTable[(unsigned char)ch]; r+=ch; source+=ch;
+#endif
+out_of_loop1:
+        ;
+    }
+    return r;
+}
+
+// The destructor releases the internal conversion buffer.
+ToXMLStringTool::~ToXMLStringTool()
+{
+    freeBuffer();
+}
+// Release the internal buffer and reset the tool to its empty state.
+void ToXMLStringTool::freeBuffer()
+{
+    free(buf); // free(NULL) is a no-op
+    buf=NULL;
+    buflen=0;
+}
+// Escape "source" into the tool's internal buffer, growing the buffer when
+// needed. The returned pointer is owned by the tool and stays valid until
+// the next toXML()/freeBuffer() call or the destruction of the tool.
+// Returns NULL when the buffer cannot be grown; the previous buffer is then
+// left untouched.
+XMLSTR ToXMLStringTool::toXML(XMLCSTR source)
+{
+    int l=lengthXMLString(source)+1;
+    if (l>buflen)
+    {
+        // keep the old pointer until realloc() has succeeded
+        XMLSTR grown=(XMLSTR)realloc(buf,l*sizeof(XMLCHAR));
+        if (!grown) return NULL;
+        buf=grown;
+        buflen=l;
+    }
+    return toXMLUnSafe(buf,source);
+}
+
+// private:
+// Decode the escape sequences of an XML string.
+//  - s   : text to decode; NULL yields NULL
+//  - lo  : number of characters of "s" to consider
+//  - pXML: its "error" member is set when NULL is returned because of an
+//          invalid escape sequence
+XMLSTR fromXMLString(XMLCSTR s, int lo, XML *pXML)
+{
+    // This function is the opposite of the function "toXMLString". It decodes the escape
+    // sequences &amp;, &quot;, &apos;, &lt;, &gt; and replaces them by the characters
+    // &,",',<,>. This function is used internally by the XML Parser. All the calls to
+    // the XML library will always give you back "decoded" strings.
+    //
+    // in: string (s) and length (lo) of string
+    // out:  new allocated string converted from xml
+    if (!s) return NULL;
+
+    int ll=0,j;
+    XMLSTR d;
+    XMLCSTR ss=s;
+    XMLCharacterEntity *entity;
+    // Pass 1: validate the escape sequences and compute the decoded length "ll".
+    while ((lo>0)&&(*s))
+    {
+        if (*s==_CXML('&'))
+        {
+            if ((lo>2)&&(s[1]==_CXML('#')))
+            {
+                // numeric reference ("&#...;" or "&#x...;"): skip to the closing ';'
+                s+=2; lo-=2;
+                if ((*s==_CXML('X'))||(*s==_CXML('x'))) { s++; lo--; }
+                while ((*s)&&(*s!=_CXML(';'))&&((lo--)>0)) s++;
+                if (*s!=_CXML(';'))
+                {
+                    pXML->error=eXMLErrorUnknownCharacterEntity;
+                    return NULL;
+                }
+                s++; lo--;
+            } else
+            {
+                // named entity: it must be one of the entries of XMLEntities
+                entity=XMLEntities;
+                do
+                {
+                    if ((lo>=entity->l)&&(xstrnicmp(s,entity->s,entity->l)==0)) { s+=entity->l; lo-=entity->l; break; }
+                    entity++;
+                } while(entity->s);
+                if (!entity->s)
+                {
+                    pXML->error=eXMLErrorUnknownCharacterEntity;
+                    return NULL;
+                }
+            }
+        } else
+        {
+#ifdef _XMLWIDECHAR
+            s++; lo--;
+#else
+            // a multi-byte character counts once per byte (j-1 here, +1 below)
+            j=XML_ByteTable[(unsigned char)*s]; s+=j; lo-=j; ll+=j-1;
+#endif
+        }
+        ll++;
+    }
+
+    // Pass 2: allocate the output and decode into it.
+    // NOTE(review): the malloc() result is written to without a NULL check.
+    d=(XMLSTR)malloc((ll+1)*sizeof(XMLCHAR));
+    // "s" is reused to remember the start of the output buffer
+    s=d;
+    while (ll-->0)
+    {
+        if (*ss==_CXML('&'))
+        {
+            if (ss[1]==_CXML('#'))
+            {
+                // numeric reference: accumulate the character code in "j"
+                ss+=2; j=0;
+                if ((*ss==_CXML('X'))||(*ss==_CXML('x')))
+                {
+                    // hexadecimal digits
+                    ss++;
+                    while (*ss!=_CXML(';'))
+                    {
+                        if ((*ss>=_CXML('0'))&&(*ss<=_CXML('9'))) j=(j<<4)+*ss-_CXML('0');
+                        else if ((*ss>=_CXML('A'))&&(*ss<=_CXML('F'))) j=(j<<4)+*ss-_CXML('A')+10;
+                        else if ((*ss>=_CXML('a'))&&(*ss<=_CXML('f'))) j=(j<<4)+*ss-_CXML('a')+10;
+                        else { free((void*)s); pXML->error=eXMLErrorUnknownCharacterEntity;return NULL;}
+                        ss++;
+                    }
+                } else
+                {
+                    // decimal digits
+                    while (*ss!=_CXML(';'))
+                    {
+                        if ((*ss>=_CXML('0'))&&(*ss<=_CXML('9'))) j=(j*10)+*ss-_CXML('0');
+                        else { free((void*)s); pXML->error=eXMLErrorUnknownCharacterEntity;return NULL;}
+                        ss++;
+                    }
+                }
+#ifndef _XMLWIDECHAR
+                // a narrow character cannot hold a code above 255
+                if (j>255) { free((void*)s); pXML->error=eXMLErrorCharacterCodeAbove255;return NULL;}
+#endif
+                (*d++)=(XMLCHAR)j; ss++;
+            } else
+            {
+                // named entity: emit the character it stands for
+                entity=XMLEntities;
+                do
+                {
+                    if (xstrnicmp(ss,entity->s,entity->l)==0) { *(d++)=entity->c; ss+=entity->l; break; }
+                    entity++;
+                } while(entity->s);
+            }
+        } else
+        {
+#ifdef _XMLWIDECHAR
+            *(d++)=*(ss++);
+#else
+            // Copy every byte of the character; the cases fall through on purpose.
+            // Pass 1 counted one unit of "ll" per byte, so the extra bytes are
+            // subtracted here (the loop condition subtracts the last one).
+            switch(XML_ByteTable[(unsigned char)*ss])
+            {
+            case 4: *(d++)=*(ss++); ll--;
+            case 3: *(d++)=*(ss++); ll--;
+            case 2: *(d++)=*(ss++); ll--;
+            case 1: *(d++)=*(ss++);
+            }
+#endif
+        }
+    }
+    *d=0;
+    return (XMLSTR)s;
+}
+
+// True when ch is a newline, space, tab or carriage return.
+// The argument is parenthesized so that any expression can be passed, but it
+// is still evaluated up to four times: do not pass an expression with side
+// effects. The outer parentheses are required: the macro is also used
+// directly as the condition of a "while".
+#define XML_isSPACECHAR(ch) (((ch)==_CXML('\n'))||((ch)==_CXML(' '))||((ch)==_CXML('\t'))||((ch)==_CXML('\r')))
+
+// private:
+// Case-insensitive test of whether "copen" starts with the tag name "cclose"
+// followed by a delimiter (white space, '/', '<', '>' or '=').
+// WARNING - unusual convention:
+//   returns 0 when the names match,
+//   returns 1 when they differ (or when "cclose" is NULL).
+char myTagCompare(XMLCSTR cclose, XMLCSTR copen)
+{
+    if (cclose==NULL) return 1;
+
+    const int nameLen=(int)xstrlen(cclose);
+    if (xstrnicmp(cclose, copen, nameLen)) return 1;
+
+    // the character right after the name decides whether it is a full match
+    switch (copen[nameLen])
+    {
+    case _CXML('\n'):
+    case _CXML(' '):
+    case _CXML('\t'):
+    case _CXML('\r'):
+    case _CXML('/'):
+    case _CXML('<'):
+    case _CXML('>'):
+    case _CXML('='):
+        return 0;
+    default:
+        return 1;
+    }
+}
+
+// Return the character at the current parse position and move the position
+// past it.
+static inline XMLCHAR getNextChar(XML *pXML)
+{
+    const XMLCHAR current = pXML->lpXML[pXML->nIndex];
+#ifdef _XMLWIDECHAR
+    // never step beyond the terminating zero
+    if (current) pXML->nIndex+=1;
+#else
+    // advance by the byte length of this character (looked up from its first byte)
+    pXML->nIndex+=XML_ByteTable[(unsigned char)current];
+#endif
+    return current;
+}
+
+// Find the next token in a string.
+//
+// Skips leading white space, classifies the token that starts at the current
+// read position of pXML and advances pXML->nIndex past it.
+//   pcbToken : receives the number of characters that have been read for the
+//              token (final read position minus token start). It is NOT
+//              written when a clear tag is matched, because that path returns
+//              early.
+//   pType    : receives the token type (eTokenError at the end of the string).
+// Returns a NextToken whose pStr points at the first character of the token
+// (NULL at the end of the string) and whose pClr points at the matching
+// XMLClearTags entry when *pType is eTokenClear (NULL otherwise).
+static NextToken GetNextToken(XML *pXML, int *pcbToken, enum XMLTokenTypeTag *pType)
+{
+    NextToken        result;
+    XMLCHAR            ch;
+    XMLCHAR            chTemp;
+    int              indexStart,nFoundMatch,nIsText=FALSE;
+    result.pClr=NULL; // prevent warning
+
+    // Find next non-white space character
+    // (indexStart ends up being the index of that character)
+    do { indexStart=pXML->nIndex; ch=getNextChar(pXML); } while XML_isSPACECHAR(ch);
+
+    if (ch)
+    {
+        // Cache the current string pointer
+        result.pStr = &pXML->lpXML[indexStart];
+
+        // First check whether the token is in the clear tag list (meaning it
+        // does not need formatting).
+        // The list is terminated by an entry whose lpszOpen is NULL; the
+        // do/while form tests the first entry unconditionally.
+        ALLXMLClearTag *ctag=XMLClearTags;
+        do
+        {
+            if (xstrncmp(ctag->lpszOpen, result.pStr, ctag->openTagLen)==0)
+            {
+                result.pClr=ctag;
+                // The first character of the open tag was already consumed
+                // by getNextChar above, hence the "-1".
+                pXML->nIndex+=ctag->openTagLen-1;
+                *pType=eTokenClear;
+                return result;
+            }
+            ctag++;
+        } while(ctag->lpszOpen);
+
+        // If we didn't find a clear tag then check for standard tokens
+        switch(ch)
+        {
+        // Check for quotes
+        case _CXML('\''):
+        case _CXML('\"'):
+            // Type of token
+            *pType = eTokenQuotedText;
+            chTemp = ch;
+
+            // No matching quote seen yet
+            nFoundMatch = FALSE;
+
+            // Search through the string to find a matching quote; give up at
+            // the first '<' or at the end of the string
+            while((ch = getNextChar(pXML)))
+            {
+                if (ch==chTemp) { nFoundMatch = TRUE; break; }
+                if (ch==_CXML('<')) break;
+            }
+
+            // If we failed to find a matching quote, rewind to just after
+            // the opening quote and treat the token as plain text
+            if (nFoundMatch == FALSE)
+            {
+                pXML->nIndex=indexStart+1;
+                nIsText=TRUE;
+                break;
+            }
+
+//  4.02.2002
+//            if (FindNonWhiteSpace(pXML)) pXML->nIndex--;
+
+            break;
+
+        // Equals (used with attribute values)
+        case _CXML('='):
+            *pType = eTokenEquals;
+            break;
+
+        // Close tag
+        case _CXML('>'):
+            *pType = eTokenCloseTag;
+            break;
+
+        // Check for tag start and tag end
+        case _CXML('<'):
+
+            // Peek at the next character to see if we have an end tag '</',
+            // or an xml declaration '<?'
+            chTemp = pXML->lpXML[pXML->nIndex];
+
+            // If we have a tag end...
+            if (chTemp == _CXML('/'))
+            {
+                // Set the type and ensure we point at the next character
+                getNextChar(pXML);
+                *pType = eTokenTagEnd;
+            }
+
+            // If we have an XML declaration tag
+            else if (chTemp == _CXML('?'))
+            {
+
+                // Set the type and ensure we point at the next character
+                getNextChar(pXML);
+                *pType = eTokenDeclaration;
+            }
+
+            // Otherwise we must have a start tag
+            else
+            {
+                *pType = eTokenTagStart;
+            }
+            break;
+
+        // Check to see if we have a short hand type end tag ('/>').
+        case _CXML('/'):
+
+            // Peek at the next character to see if we have a short end tag '/>'
+            chTemp = pXML->lpXML[pXML->nIndex];
+
+            // If we have a short hand end tag...
+            if (chTemp == _CXML('>'))
+            {
+                // Set the type and ensure we point at the next character
+                getNextChar(pXML);
+                *pType = eTokenShortHandClose;
+                break;
+            }
+
+            // If we haven't found a short hand closing tag then drop into the
+            // text process (intentional fall through into "default")
+
+        // Other characters
+        default:
+            nIsText = TRUE;
+        }
+
+        // If this is a TEXT node
+        if (nIsText)
+        {
+            // Indicate we are dealing with text
+            *pType = eTokenText;
+            while((ch = getNextChar(pXML)))
+            {
+                if XML_isSPACECHAR(ch)
+                {
+                    // The white space character has been consumed; bumping
+                    // indexStart keeps it out of the length computed below.
+                    indexStart++; break;
+
+                } else if (ch==_CXML('/'))
+                {
+                    // If we find a slash then this may be text or a short hand end tag.
+                    // Peek at the next character to see if we have a short hand end tag
+                    ch=pXML->lpXML[pXML->nIndex];
+                    // If we found a short hand end tag then we need to exit the loop
+                    // (the '/' is pushed back so it starts the next token)
+                    if (ch==_CXML('>')) { pXML->nIndex--; break; }
+
+                } else if ((ch==_CXML('<'))||(ch==_CXML('>'))||(ch==_CXML('=')))
+                {
+                    // Push the delimiter back: it belongs to the next token
+                    pXML->nIndex--; break;
+                }
+            }
+        }
+        *pcbToken = pXML->nIndex-indexStart;
+    } else
+    {
+        // If we failed to obtain a valid character
+        *pcbToken = 0;
+        *pType = eTokenError;
+        result.pStr=NULL;
+    }
+
+    return result;
+}
+
+// Replace the name of this node with lpszName.  The buffer itself is stored
+// (no copy is made), so the node takes ownership of it.  The previous name is
+// freed unless it is the very same buffer.  When the node has no data the
+// buffer is freed and NULL is returned; otherwise lpszName is returned.
+XMLCSTR XMLNode::updateName_WOSD(XMLSTR lpszName)
+{
+    if (d==NULL)
+    {
+        // Nowhere to store the name: release the buffer we were handed.
+        free(lpszName);
+        return NULL;
+    }
+
+    XMLCSTR previous=d->lpszName;
+    if ((previous!=NULL)&&(previous!=lpszName))
+    {
+        free((void*)previous);
+    }
+
+    d->lpszName=lpszName;
+    return lpszName;
+}
+
+// private:
+// Build a node object that shares the already existing node data p and
+// counts one more reference on it.  p is dereferenced unconditionally.
+XMLNode::XMLNode(struct XMLNodeDataTag *p)
+{
+    d=p;
+    p->ref_count++;
+}
+XMLNode::XMLNode(XMLNodeData *pParent, XMLSTR lpszName, char isDeclaration)
+{
+    d=(XMLNodeData*)malloc(sizeof(XMLNodeData));
+    d->ref_count=1;
+
+    d->lpszName=NULL;
+    d->nChild= 0;
+    d->nText = 0;
+    d->nClear = 0;
+    d->nAttribute = 0;
+
+    d->isDeclaration = isDeclaration;
+
+    d->pParent = pParent;
+    d->pChild= NULL;
+    d->pText= NULL;
+    d->pClear= NULL;
+    d->pAttribute= NULL;
+    d->pOrder= NULL;
+
+    updateName_WOSD(lpszName);
+}
+
+// Create a parent-less node whose name is the given buffer itself
+// (the buffer is handed over to the node, it is not duplicated).
+XMLNode XMLNode::createXMLTopNode_WOSD(XMLSTR lpszName, char isDeclaration)
+{
+    return XMLNode(NULL,lpszName,isDeclaration);
+}
+// Create a parent-less node named after a stringDup() copy of lpszName.
+XMLNode XMLNode::createXMLTopNode(XMLCSTR lpszName, char isDeclaration)
+{
+    return XMLNode(NULL,stringDup(lpszName),isDeclaration);
+}
+
+#define MEMORYINCREASE 50
+
+// Release a buffer obtained from malloc/realloc.  A NULL pointer is accepted:
+// free(NULL) is defined to do nothing, so no explicit guard is needed.
+static inline void myFree(void *p) { free(p); }
+// Chunked growth helper for the element arrays.
+//   p          : current buffer, or NULL when nothing has been allocated yet
+//   newsize    : number of elements the buffer is about to hold
+//   memInc     : growth step in elements (0 = resize on every call)
+//   sizeofElem : size of one element in bytes
+// A NULL buffer is allocated with memInc elements (one element when memInc
+// is 0).  An existing buffer is resized to newsize+memInc elements only when
+// memInc is 0 or newsize is a multiple of memInc; otherwise it is returned
+// untouched.  Allocation failure is not handled here: whatever malloc/realloc
+// returned (possibly NULL) is passed back to the caller.
+static inline void *myRealloc(void *p, int newsize, int memInc, int sizeofElem)
+{
+    if (!p)
+    {
+        if (memInc==0) return malloc(sizeofElem);
+        return malloc(memInc*sizeofElem);
+    }
+
+    const int mustResize=(memInc==0)||((newsize%memInc)==0);
+    if (!mustResize) return p;
+
+    return realloc(p,(newsize+memInc)*sizeofElem);
+}
+
+// private:
+// Return the slot inside d->pOrder that holds the entry for element number
+// "index" of type xxtype (entries are encoded as (index<<2)+type).
+// Returns -1 for a negative index.  The scan has no upper bound: the entry
+// is expected to be present.
+XMLElementPosition XMLNode::findPosition(XMLNodeData *d, int index, XMLElementType xxtype)
+{
+    if (index<0) return -1;
+
+    const int wanted=(int)((index<<2)+xxtype);
+    const int *order=d->pOrder;
+    int slot=0;
+    while (order[slot]!=wanted)
+    {
+        slot++;
+    }
+    return slot;
+}
+
+// private:
+// Update the "order" information when deleting a content of a XMLNode:
+// drop the pOrder entry of element number "index" of type t, close the gap,
+// and renumber the following entries of the same type (their encoded index
+// goes down by one, i.e. the entry value goes down by 4).
+// NOTE(review): "total" is read from the node's current counters and
+// (total-slot) entries are moved, which presumably relies on the caller
+// having already decremented the counter of the removed element - confirm
+// against the callers.
+// pOrder is deliberately not shrunk here (too time consuming); it is
+// released as a whole later on.
+int XMLNode::removeOrderElement(XMLNodeData *d, XMLElementType t, int index)
+{
+    const int total=d->nChild+d->nText+d->nClear;
+    int *order=d->pOrder;
+    int slot=findPosition(d,index,t);
+
+    memmove(order+slot, order+slot+1, (total-slot)*sizeof(int));
+
+    while (slot<total)
+    {
+        if ((order[slot]&3)==(int)t) order[slot]-=4;
+        slot++;
+    }
+    return slot;
+}
+
+void *XMLNode::addToOrder(int memoryIncrease,int *_pos, int nc, void *p, int size, XMLElementType xtype)
+{
+    //  in: *_pos is the position inside d->pOrder ("-1" means "EndOf")
+    // out: *_pos is the index inside p
+    p=myRealloc(p,(nc+1),memoryIncrease,size);
+    int n=d->nChild+d->nText+d->nClear;
+    d->pOrder=(int*)myRealloc(d->pOrder,n+1,memoryIncrease*3,sizeof(int));
+    int pos=*_pos,*o=d->pOrder;
+
+    if ((pos<0)||(pos>=n)) { *_pos=nc; o[n]=(int)((nc<<2)+xtype); return p; }
+
+    int i=pos;
+    memmove(o+i+1, o+i, (n-i)*sizeof(int));
+
+    while ((pos<n)&&((o[pos]&3)!=(int)xtype)) pos++;
+    if (pos==n) { *_pos=nc; o[n]=(int)((nc<<2)+xtype); return p; }
+
+    o[i]=o[pos];
+    for (i=pos+1;i<=n;i++) if ((o[i]&3)==(int)xtype) o[i]+=4;
+
+    *_pos=pos=o[pos]>>2;
+    memmove(((char*)p)+(pos+1)*size,((char*)p)+pos*size,(nc-pos)*size);
+
+    return p;
+}
+
+// Add a child node to the given element.
+//   memoryIncrease : growth step used for the child and order arrays
+//   lpszName       : name buffer of the new child; ownership is taken
+//   isDeclaration  : stored in the new child
+//   pos            : wanted position inside the order array (-1 = at the end)
+// Returns the new child, or emptyXMLNode when lpszName is NULL or when this
+// node has no data.  In the latter case lpszName is released, like the other
+// add*_priv functions do with the buffers they were handed.
+XMLNode XMLNode::addChild_priv(int memoryIncrease, XMLSTR lpszName, char isDeclaration, int pos)
+{
+    if (!lpszName) return emptyXMLNode;
+    if (!d) { myFree(lpszName); return emptyXMLNode; }
+    d->pChild=(XMLNode*)addToOrder(memoryIncrease,&pos,d->nChild,d->pChild,sizeof(XMLNode),eNodeChild);
+    // The slot holds raw memory: clear it before assigning a node into it.
+    d->pChild[pos].d=NULL;
+    d->pChild[pos]=XMLNode(d,lpszName,isDeclaration);
+    d->nChild++;
+    return d->pChild[pos];
+}
+
+// Add an attribute to an element.
+// Both buffers are stored as they are (no copy).  Returns the new attribute,
+// or &emptyXMLAttribute when lpszName is NULL or when this node has no data
+// (in the latter case both buffers are released).
+XMLAttribute *XMLNode::addAttribute_priv(int memoryIncrease,XMLSTR lpszName, XMLSTR lpszValuev)
+{
+    if (lpszName==NULL) return &emptyXMLAttribute;
+    if (d==NULL)
+    {
+        myFree(lpszName);
+        myFree(lpszValuev);
+        return &emptyXMLAttribute;
+    }
+
+    // Attributes are not part of the order array: simply append.
+    const int slot=d->nAttribute;
+    d->pAttribute=(XMLAttribute*)myRealloc(d->pAttribute,(slot+1),memoryIncrease,sizeof(XMLAttribute));
+
+    XMLAttribute *added=&d->pAttribute[slot];
+    added->lpszName = lpszName;
+    added->lpszValue = lpszValuev;
+    d->nAttribute++;
+    return added;
+}
+
+// Add text to the element.
+// The buffer is stored as it is (no copy) at order position pos (-1 = at the
+// end).  Returns lpszValue, or NULL when lpszValue is NULL or when this node
+// has no data (in the latter case the buffer is released).
+XMLCSTR XMLNode::addText_priv(int memoryIncrease, XMLSTR lpszValue, int pos)
+{
+    if (lpszValue==NULL) return NULL;
+    if (d==NULL)
+    {
+        myFree(lpszValue);
+        return NULL;
+    }
+
+    // addToOrder turns pos into the index of the free slot inside pText.
+    void *texts=addToOrder(memoryIncrease,&pos,d->nText,d->pText,sizeof(XMLSTR),eNodeText);
+    d->pText=(XMLCSTR*)texts;
+    d->pText[pos]=lpszValue;
+    d->nText++;
+    return lpszValue;
+}
+
+// Add clear (unformatted) text to the element.
+XMLClear *XMLNode::addClear_priv(int memoryIncrease, XMLSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, int pos)
+{
+    if (!lpszValue) return &emptyXMLClear;
+    if (!d) { myFree(lpszValue); return &emptyXMLClear; }
+    d->pClear=(XMLClear *)addToOrder(memoryIncrease,&pos,d->nClear,d->pClear,sizeof(XMLClear),eNodeClear);
+    XMLClear *pNewClear=d->pClear+pos;
+    pNewClear->lpszValue = lpszValue;
+    if (!lpszOpen) lpszOpen=XMLClearTags->lpszOpen;
+    if (!lpszClose) lpszClose=XMLClearTags->lpszClose;
+    pNewClear->lpszOpenTag = lpszOpen;
+    pNewClear->lpszCloseTag = lpszClose;
+    d->nClear++;
+    return pNewClear;
+}
+
+// private:
+// Parse a clear (unformatted) type node.
+char XMLNode::parseClearTag(void *px, void *_pClear)
+{
+    XML *pXML=(XML *)px;
+    ALLXMLClearTag pClear=*((ALLXMLClearTag*)_pClear);
+    int cbTemp=0;
+    XMLCSTR lpszTemp=NULL;
+    XMLCSTR lpXML=&pXML->lpXML[pXML->nIndex];
+    static XMLCSTR docTypeEnd=_CXML("]>");
+
+    // Find the closing tag
+    // Seems the <!DOCTYPE need a better treatment so lets handle it
+    if (pClear.lpszOpen==XMLClearTags[1].lpszOpen)
+    {
+        XMLCSTR pCh=lpXML;
+        while (*pCh)
+        {
+            if (*pCh==_CXML('<')) { pClear.lpszClose=docTypeEnd; lpszTemp=xstrstr(lpXML,docTypeEnd); break; }
+            else if (*pCh==_CXML('>')) { lpszTemp=pCh; break; }
+#ifdef _XMLWIDECHAR
+            pCh++;
+#else
+            pCh+=XML_ByteTable[(unsigned char)(*pCh)];
+#endif
+        }
+    } else lpszTemp=xstrstr(lpXML, pClear.lpszClose);
+
+    if (lpszTemp)
+    {
+        // Cache the size and increment the index
+        cbTemp = (int)(lpszTemp - lpXML);
+
+        pXML->nIndex += cbTemp+(int)xstrlen(pClear.lpszClose);
+
+        // Add the clear node to the current element
+        addClear_priv(MEMORYINCREASE,stringDup(lpXML,cbTemp), pClear.lpszOpen, pClear.lpszClose,-1);
+        return 0;
+    }
+
+    // If we failed to find the end tag
+    pXML->error = eXMLErrorUnmatchedEndClearTag;
+    return 1;
+}
+
+// Resize every allocated array of the node data to exactly the number of
+// elements it currently holds.  Arrays that are NULL are left alone.  The
+// result of realloc is stored without being checked.
+void XMLNode::exactMemory(XMLNodeData *d)
+{
+    if (d->pOrder)
+    {
+        const int nOrder=d->nChild+d->nText+d->nClear;
+        d->pOrder=(int*)realloc(d->pOrder,nOrder*sizeof(int));
+    }
+    if (d->pChild)
+    {
+        d->pChild=(XMLNode*)realloc(d->pChild,d->nChild*sizeof(XMLNode));
+    }
+    if (d->pAttribute)
+    {
+        d->pAttribute=(XMLAttribute*)realloc(d->pAttribute,d->nAttribute*sizeof(XMLAttribute));
+    }
+    if (d->pText)
+    {
+        d->pText=(XMLCSTR*)realloc(d->pText,d->nText*sizeof(XMLSTR));
+    }
+    if (d->pClear)
+    {
+        d->pClear=(XMLClear *)realloc(d->pClear,d->nClear*sizeof(XMLClear));
+    }
+}
+
+char XMLNode::maybeAddTxT(void *pa, XMLCSTR tokenPStr)
+{
+    XML *pXML=(XML *)pa;
+    XMLCSTR lpszText=pXML->lpszText;
+    if (!lpszText) return 0;
+    if (dropWhiteSpace) while (XML_isSPACECHAR(*lpszText)&&(lpszText!=tokenPStr)) lpszText++;
+    int cbText = (int)(tokenPStr - lpszText);
+    if (!cbText) { pXML->lpszText=NULL; return 0; }
+    if (dropWhiteSpace) { cbText--; while ((cbText)&&XML_isSPACECHAR(lpszText[cbText])) cbText--; cbText++; }
+    if (!cbText) { pXML->lpszText=NULL; return 0; }
+    XMLSTR lpt=fromXMLString(lpszText,cbText,pXML);
+    if (!lpt) return 1;
+    pXML->lpszText=NULL;
+    if (removeCommentsInMiddleOfText && d->nText && d->nClear)
+    {
+        // if the previous insertion was a comment (<!-- -->) AND
+        // if the previous previous insertion was a text then, delete the comment and append the text
+        int n=d->nChild+d->nText+d->nClear-1,*o=d->pOrder;
+        if (((o[n]&3)==eNodeClear)&&((o[n-1]&3)==eNodeText))
+        {
+            int i=o[n]>>2;
+            if (d->pClear[i].lpszOpenTag==XMLClearTags[2].lpszOpen)
+            {
+                deleteClear(i);
+                i=o[n-1]>>2;
+                n=xstrlen(d->pText[i]);
+                int n2=xstrlen(lpt)+1;
+                d->pText[i]=(XMLSTR)realloc((void*)d->pText[i],(n+n2)*sizeof(XMLCHAR));
+                if (!d->pText[i]) return 1;
+                memcpy((void*)(d->pText[i]+n),lpt,n2*sizeof(XMLCHAR));
+                free(lpt);
+                return 0;
+            }
+        }
+    }
+    addText_priv(MEMORYINCREASE,lpt,-1);
+    return 0;
+}
+// private:
+// Recursively parse an XML element.
+//
+// pa is the parser state (XML*).  Tokens are pulled with GetNextToken and fed
+// to a small state machine:
+//   status : eOutsideTag (element content) or eInsideTag (attribute list)
+//   attrib : which part of an attribute is expected next (name, '=', value)
+// On the very first call (pXML->nFirst set) parsing starts outside of any
+// tag; every other call starts inside the opening tag of this node, i.e. by
+// reading its attributes.  Child elements are added to this node and parsed
+// by a recursive call.
+//
+// Returns TRUE when this element is finished and the caller should go on,
+// FALSE when parsing must stop (error or end of input).  Errors are reported
+// through pXML->error.  A non-zero pXML->cbEndTag on return means an end tag
+// that did not match this element is still looking for its element while the
+// recursion unwinds; a non-zero pXML->cbNewElement asks the caller to create
+// a new element.
+int XMLNode::ParseXMLElement(void *pa)
+{
+    XML *pXML=(XML *)pa;
+    int cbToken;
+    enum XMLTokenTypeTag xtype;
+    NextToken token;
+    XMLCSTR lpszTemp=NULL;
+    int cbTemp=0;
+    char nDeclaration;
+    XMLNode pNew;
+    enum Status status; // inside or outside a tag
+    enum Attrib attrib = eAttribName;
+
+    assert(pXML);
+
+    // If this is the first call to the function
+    if (pXML->nFirst)
+    {
+        // Assume we are outside of a tag definition
+        pXML->nFirst = FALSE;
+        status = eOutsideTag;
+    } else
+    {
+        // If this is not the first call then we should only be called when inside a tag.
+        status = eInsideTag;
+    }
+
+    // Iterate through the tokens in the document
+    for(;;)
+    {
+        // Obtain the next token
+        token = GetNextToken(pXML, &cbToken, &xtype);
+
+        if (xtype != eTokenError)
+        {
+            // Check the current status
+            switch(status)
+            {
+
+            // If we are outside of a tag definition
+            case eOutsideTag:
+
+                // Check what type of token we obtained
+                switch(xtype)
+                {
+                // If we have found text or quoted text: nothing to do now,
+                // the text is collected later by maybeAddTxT
+                case eTokenText:
+                case eTokenCloseTag:          /* '>'         */
+                case eTokenShortHandClose:    /* '/>'        */
+                case eTokenQuotedText:
+                case eTokenEquals:
+                    break;
+
+                // If we found a start tag '<' and declarations '<?'
+                case eTokenTagStart:
+                case eTokenDeclaration:
+
+                    // Cache whether this new element is a declaration or not
+                    nDeclaration = (xtype == eTokenDeclaration);
+
+                    // If we have node text then add this to the element
+                    if (maybeAddTxT(pXML,token.pStr)) return FALSE;
+
+                    // Find the name of the tag
+                    token = GetNextToken(pXML, &cbToken, &xtype);
+
+                    // Return an error if we couldn't obtain the next token or
+                    // it wasn't text
+                    if (xtype != eTokenText)
+                    {
+                        pXML->error = eXMLErrorMissingTagName;
+                        return FALSE;
+                    }
+
+                    // If we found a new element which is the same as this
+                    // element then we need to pass this back to the caller..
+
+#ifdef APPROXIMATE_PARSING
+                    if (d->lpszName &&
+                        myTagCompare(d->lpszName, token.pStr) == 0)
+                    {
+                        // Indicate to the caller that it needs to create a
+                        // new element.
+                        pXML->lpNewElement = token.pStr;
+                        pXML->cbNewElement = cbToken;
+                        return TRUE;
+                    } else
+#endif
+                    {
+                        // If the name of the new element differs from the name of
+                        // the current element we need to add the new element to
+                        // the current one and recurse
+                        pNew = addChild_priv(MEMORYINCREASE,stringDup(token.pStr,cbToken), nDeclaration,-1);
+
+                        while (!pNew.isEmpty())
+                        {
+                            // Call ourselves to process the new node.  If we
+                            // return FALSE this means we don't have any more
+                            // processing to do...
+
+                            if (!pNew.ParseXMLElement(pXML)) return FALSE;
+                            else
+                            {
+                                // If the call to recurse this function
+                                // ended in an end tag specified in XML then
+                                // we need to unwind the calls to this
+                                // function until we find the appropriate node
+                                // (the element name and end tag name must
+                                // match)
+                                if (pXML->cbEndTag)
+                                {
+                                    // If we are back at the root node then we
+                                    // have an unmatched end tag
+                                    if (!d->lpszName)
+                                    {
+                                        pXML->error=eXMLErrorUnmatchedEndTag;
+                                        return FALSE;
+                                    }
+
+                                    // If the end tag matches the name of this
+                                    // element then we only need to unwind
+                                    // once more...
+
+                                    if (myTagCompare(d->lpszName, pXML->lpEndTag)==0)
+                                    {
+                                        pXML->cbEndTag = 0;
+                                    }
+
+                                    return TRUE;
+                                } else
+                                    if (pXML->cbNewElement)
+                                    {
+                                        // If the call indicated a new element is to
+                                        // be created on THIS element.
+
+                                        // If the name of this element matches the
+                                        // name of the element we need to create
+                                        // then we need to return to the caller
+                                        // and let it process the element.
+
+                                        if (myTagCompare(d->lpszName, pXML->lpNewElement)==0)
+                                        {
+                                            return TRUE;
+                                        }
+
+                                        // Add the new element and recurse
+                                        pNew = addChild_priv(MEMORYINCREASE,stringDup(pXML->lpNewElement,pXML->cbNewElement),0,-1);
+                                        pXML->cbNewElement = 0;
+                                    }
+                                    else
+                                    {
+                                        // If we didn't have a new element to create
+                                        // (this ends the while loop)
+                                        pNew = emptyXMLNode;
+
+                                    }
+                            }
+                        }
+                    }
+                    break;
+
+                // If we found an end tag
+                case eTokenTagEnd:
+
+                    // If we have node text then add this to the element
+                    if (maybeAddTxT(pXML,token.pStr)) return FALSE;
+
+                    // Find the name of the end tag
+                    token = GetNextToken(pXML, &cbTemp, &xtype);
+
+                    // The end tag should be text
+                    if (xtype != eTokenText)
+                    {
+                        pXML->error = eXMLErrorMissingEndTagName;
+                        return FALSE;
+                    }
+                    lpszTemp = token.pStr;
+
+                    // After the end tag we should find a closing tag
+                    token = GetNextToken(pXML, &cbToken, &xtype);
+                    if (xtype != eTokenCloseTag)
+                    {
+                        pXML->error = eXMLErrorMissingEndTagName;
+                        return FALSE;
+                    }
+                    // Any text for the parent starts right after the '>'
+                    pXML->lpszText=pXML->lpXML+pXML->nIndex;
+
+                    // We need to return to the previous caller.  If the name
+                    // of the tag cannot be found we need to keep returning to
+                    // caller until we find a match
+                    if (myTagCompare(d->lpszName, lpszTemp) != 0)
+#ifdef STRICT_PARSING
+                    {
+                        pXML->error=eXMLErrorUnmatchedEndTag;
+                        pXML->nIndexMissigEndTag=pXML->nIndex;
+                        return FALSE;
+                    }
+#else
+                    {
+                        // Record the unmatched end tag and let the callers
+                        // unwind until one of them matches it
+                        pXML->error=eXMLErrorMissingEndTag;
+                        pXML->nIndexMissigEndTag=pXML->nIndex;
+                        pXML->lpEndTag = lpszTemp;
+                        pXML->cbEndTag = cbTemp;
+                    }
+#endif
+
+                    // Return to the caller
+                    exactMemory(d);
+                    return TRUE;
+
+                // If we found a clear (unformatted) token
+                case eTokenClear:
+                    // If we have node text then add this to the element
+                    if (maybeAddTxT(pXML,token.pStr)) return FALSE;
+                    if (parseClearTag(pXML, token.pClr)) return FALSE;
+                    pXML->lpszText=pXML->lpXML+pXML->nIndex;
+                    break;
+
+                default:
+                    break;
+                }
+                break;
+
+            // If we are inside a tag definition we need to search for attributes
+            case eInsideTag:
+
+                // Check what part of the attribute (name, equals, value) we
+                // are looking for.
+                switch(attrib)
+                {
+                // If we are looking for a new attribute
+                case eAttribName:
+
+                    // Check what the current token type is
+                    switch(xtype)
+                    {
+                    // If the current type is text...
+                    // Eg.  'attribute'
+                    case eTokenText:
+                        // Cache the token then indicate that we are next to
+                        // look for the equals
+                        lpszTemp = token.pStr;
+                        cbTemp = cbToken;
+                        attrib = eAttribEquals;
+                        break;
+
+                    // If we found a closing tag...
+                    // Eg.  '>'
+                    case eTokenCloseTag:
+                        // We are now outside the tag
+                        status = eOutsideTag;
+                        pXML->lpszText=pXML->lpXML+pXML->nIndex;
+                        break;
+
+                    // If we found a short hand '/>' closing tag then we can
+                    // return to the caller
+                    case eTokenShortHandClose:
+                        exactMemory(d);
+                        pXML->lpszText=pXML->lpXML+pXML->nIndex;
+                        return TRUE;
+
+                    // Errors...
+                    case eTokenQuotedText:    /* '"SomeText"'   */
+                    case eTokenTagStart:      /* '<'            */
+                    case eTokenTagEnd:        /* '</'           */
+                    case eTokenEquals:        /* '='            */
+                    case eTokenDeclaration:   /* '<?'           */
+                    case eTokenClear:
+                        pXML->error = eXMLErrorUnexpectedToken;
+                        return FALSE;
+                    default: break;
+                    }
+                    break;
+
+                // If we are looking for an equals
+                case eAttribEquals:
+                    // Check what the current token type is
+                    switch(xtype)
+                    {
+                    // If the current type is text...
+                    // Eg.  'Attribute AnotherAttribute'
+                    case eTokenText:
+                        // Add the unvalued attribute to the list
+                        addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp), NULL);
+                        // Cache the new token.  We are still looking for an
+                        // equals, now for this new attribute
+                        lpszTemp = token.pStr;
+                        cbTemp = cbToken;
+                        break;
+
+                    // If we found a closing tag 'Attribute >' or a short hand
+                    // closing tag 'Attribute />'
+                    case eTokenShortHandClose:
+                    case eTokenCloseTag:
+                        // If we are a declaration element '<?' then we need
+                        // to remove extra closing '?' if it exists
+                        pXML->lpszText=pXML->lpXML+pXML->nIndex;
+
+                        if (d->isDeclaration &&
+                            (lpszTemp[cbTemp-1]) == _CXML('?'))
+                        {
+                            cbTemp--;
+                            // A declaration nested below the top level is
+                            // closed here, as if it had ended with '/>'
+                            if (d->pParent && d->pParent->pParent) xtype = eTokenShortHandClose;
+                        }
+
+                        if (cbTemp)
+                        {
+                            // Add the unvalued attribute to the list
+                            addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp), NULL);
+                        }
+
+                        // If this is the end of the tag then return to the caller
+                        if (xtype == eTokenShortHandClose)
+                        {
+                            exactMemory(d);
+                            return TRUE;
+                        }
+
+                        // We are now outside the tag
+                        status = eOutsideTag;
+                        break;
+
+                    // If we found the equals token...
+                    // Eg.  'Attribute ='
+                    case eTokenEquals:
+                        // Indicate that we next need to search for the value
+                        // for the attribute
+                        attrib = eAttribValue;
+                        break;
+
+                    // Errors...
+                    case eTokenQuotedText:    /* 'Attribute "InvalidAttr"'*/
+                    case eTokenTagStart:      /* 'Attribute <'            */
+                    case eTokenTagEnd:        /* 'Attribute </'           */
+                    case eTokenDeclaration:   /* 'Attribute <?'           */
+                    case eTokenClear:
+                        pXML->error = eXMLErrorUnexpectedToken;
+                        return FALSE;
+                    default: break;
+                    }
+                    break;
+
+                // If we are looking for an attribute value
+                case eAttribValue:
+                    // Check what the current token type is
+                    switch(xtype)
+                    {
+                    // If the current type is text or quoted text...
+                    // Eg.  'Attribute = "Value"' or 'Attribute = Value' or
+                    // 'Attribute = 'Value''.
+                    case eTokenText:
+                    case eTokenQuotedText:
+                        // If we are a declaration element '<?' then we need
+                        // to remove extra closing '?' if it exists
+                        if (d->isDeclaration &&
+                            (token.pStr[cbToken-1]) == _CXML('?'))
+                        {
+                            cbToken--;
+                        }
+
+                        if (cbTemp)
+                        {
+                            // Add the valued attribute to the list
+                            // (for quoted text, strip the two quote characters)
+                            if (xtype==eTokenQuotedText) { token.pStr++; cbToken-=2; }
+                            XMLSTR attrVal=(XMLSTR)token.pStr;
+                            if (attrVal)
+                            {
+                                attrVal=fromXMLString(attrVal,cbToken,pXML);
+                                if (!attrVal) return FALSE;
+                            }
+                            addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp),attrVal);
+                        }
+
+                        // Indicate we are searching for a new attribute
+                        attrib = eAttribName;
+                        break;
+
+                    // Errors...
+                    case eTokenTagStart:        /* 'Attr = <'          */
+                    case eTokenTagEnd:          /* 'Attr = </'         */
+                    case eTokenCloseTag:        /* 'Attr = >'          */
+                    case eTokenShortHandClose:  /* "Attr = />"         */
+                    case eTokenEquals:          /* 'Attr = ='          */
+                    case eTokenDeclaration:     /* 'Attr = <?'         */
+                    case eTokenClear:
+                        pXML->error = eXMLErrorUnexpectedToken;
+                        return FALSE;
+                        break;
+                    default: break;
+                    }
+                }
+            }
+        }
+        // If we failed to obtain the next token
+        else
+        {
+            // Running out of input inside a regular (non-declaration, non-root)
+            // element means its end tag is missing
+            if ((!d->isDeclaration)&&(d->pParent))
+            {
+#ifdef STRICT_PARSING
+                pXML->error=eXMLErrorUnmatchedEndTag;
+#else
+                pXML->error=eXMLErrorMissingEndTag;
+#endif
+                pXML->nIndexMissigEndTag=pXML->nIndex;
+            }
+            // Keep whatever text was pending up to the current read position
+            maybeAddTxT(pXML,pXML->lpXML+pXML->nIndex);
+            return FALSE;
+        }
+    }
+}
+
+// Count the number of lines and columns in an XML string.
+static void CountLinesAndColumns(XMLCSTR lpXML, int nUpto, XMLResults *pResults)
+{
+    XMLCHAR ch;
+    assert(lpXML);
+    assert(pResults);
+
+    struct XML xml={ lpXML,lpXML, 0, 0, eXMLErrorNone, NULL, 0, NULL, 0, TRUE };
+
+    pResults->nLine = 1;
+    pResults->nColumn = 1;
+    while (xml.nIndex<nUpto)
+    {
+        ch = getNextChar(&xml);
+        if (ch != _CXML('\n')) pResults->nColumn++;
+        else
+        {
+            pResults->nLine++;
+            pResults->nColumn=1;
+        }
+    }
+}
+
+// Parse XML and return the root element.
+//
+// @param lpszXML   text to parse.  When NULL, emptyXMLNode is returned and
+//                  pResults (if given) is set to eXMLErrorNoElements with
+//                  line and column 0.
+// @param tag       optional element name.  When it is non-empty and the top
+//                  node has no name, or a name that xstricmp() reports as
+//                  different, the child carrying that name is returned
+//                  instead of the top node.
+// @param pResults  optional.  Receives the error code; for any code other
+//                  than eXMLErrorNone it also receives the line and column
+//                  computed by CountLinesAndColumns().
+// @return          the selected node, or an empty node when the parse failed
+//                  with a non-tolerated error or the requested tag is absent.
+XMLNode XMLNode::parseString(XMLCSTR lpszXML, XMLCSTR tag, XMLResults *pResults)
+{
+    if (!lpszXML)
+    {
+        if (pResults)
+        {
+            pResults->error=eXMLErrorNoElements;
+            pResults->nLine=0;
+            pResults->nColumn=0;
+        }
+        return emptyXMLNode;
+    }
+
+    XMLNode xnode(NULL,NULL,FALSE);
+    struct XML xml={ lpszXML, lpszXML, 0, 0, eXMLErrorNone, NULL, 0, NULL, 0, TRUE };
+
+    // Create header element
+    xnode.ParseXMLElement(&xml);
+    enum XMLError error = xml.error;
+    // A parse that produced no child node at all overrides whatever error the
+    // parser recorded.
+    if (!xnode.nChildNode()) error=eXMLErrorNoXMLTagFound;
+    if ((xnode.nChildNode()==1)&&(xnode.nElement()==1)) xnode=xnode.getChildNode(); // skip the empty node
+
+    // No error, or one of the two errors after which the tree is still kept
+    // (missing end tag, no XML tag found).
+    if ((error==eXMLErrorNone)||(error==eXMLErrorMissingEndTag)||(error==eXMLErrorNoXMLTagFound))
+    {
+        XMLCSTR name=xnode.getName();
+        if (tag&&(*tag)&&((!name)||(xstricmp(name,tag))))
+        {
+            xnode=xnode.getChildNode(tag);
+            if (xnode.isEmpty())
+            {
+                // The requested tag is missing: this replaces any error code
+                // from the parse itself and reports no position.
+                if (pResults)
+                {
+                    pResults->error=eXMLErrorFirstTagNotFound;
+                    pResults->nLine=0;
+                    pResults->nColumn=0;
+                }
+                return emptyXMLNode;
+            }
+        }
+    } else
+    {
+        // Cleanup: this will destroy all the nodes
+        xnode = emptyXMLNode;
+    }
+
+
+    // If we have been given somewhere to place results
+    if (pResults)
+    {
+        pResults->error = error;
+
+        // If we have an error
+        if (error!=eXMLErrorNone)
+        {
+            // For a missing end tag the position to report is the one stored
+            // in nIndexMissigEndTag rather than the final parse index.
+            if (error==eXMLErrorMissingEndTag) xml.nIndex=xml.nIndexMissigEndTag;
+            // Find which line and column it starts on.
+            CountLinesAndColumns(xml.lpXML, xml.nIndex, pResults);
+        }
+    }
+    return xnode;
+}
+
+XMLNode XMLNode::parseFile(XMLCSTR filename, XMLCSTR tag, XMLResults *pResults)
+{
+    if (pResults) { pResults->nLine=0; pResults->nColumn=0; }
+    FILE *f=xfopen(filename,_CXML("rb"));
+    if (f==NULL) { if (pResults) pResults->error=eXMLErrorFileNotFound; return emptyXMLNode; }
+    fseek(f,0,SEEK_END);
+    int l=ftell(f),headerSz=0;
+    if (!l) { if (pResults) pResults->error=eXMLErrorEmpty; fclose(f); return emptyXMLNode; }
+    fseek(f,0,SEEK_SET);
+    unsigned char *buf=(unsigned char*)malloc(l+4);
+    l=fread(buf,1,l,f);
+    fclose(f);
+    buf[l]=0;buf[l+1]=0;buf[l+2]=0;buf[l+3]=0;
+#ifdef _XMLWIDECHAR
+    if (guessWideCharChars)
+    {
+        if (!myIsTextWideChar(buf,l))
+        {
+            XMLNode::XMLCharEncoding ce=XMLNode::char_encoding_legacy;
+            if ((buf[0]==0xef)&&(buf[1]==0xbb)&&(buf[2]==0xbf)) { headerSz=3; ce=XMLNode::char_encoding_UTF8; }
+            XMLSTR b2=myMultiByteToWideChar((const char*)(buf+headerSz),ce);
+            free(buf); buf=(unsigned char*)b2; headerSz=0;
+        } else
+        {
+            if ((buf[0]==0xef)&&(buf[1]==0xff)) headerSz=2;
+            if ((buf[0]==0xff)&&(buf[1]==0xfe)) headerSz=2;
+        }
+    }
+#else
+    if (guessWideCharChars)
+    {
+        if (myIsTextWideChar(buf,l))
+        {
+            if ((buf[0]==0xef)&&(buf[1]==0xff)) headerSz=2;
+            if ((buf[0]==0xff)&&(buf[1]==0xfe)) headerSz=2;
+            char *b2=myWideCharToMultiByte((const wchar_t*)(buf+headerSz));
+            free(buf); buf=(unsigned char*)b2; headerSz=0;
+        } else
+        {
+            if ((buf[0]==0xef)&&(buf[1]==0xbb)&&(buf[2]==0xbf)) headerSz=3;
+        }
+    }
+#endif
+
+    if (!buf) { if (pResults) pResults->error=eXMLErrorCharConversionError; return emptyXMLNode; }
+    XMLNode x=parseString((XMLSTR)(buf+headerSz),tag,pResults);
+    free(buf);
+    return x;
+}
+
+// Store l copies of the character c at consecutive positions starting at dest.
+static inline void charmemset(XMLSTR dest,XMLCHAR c,int l) { for (; l; l--) *dest++=c; }
+// private:
+// Creates an user friendly XML string from a given element with
+// appropriate white space and carriage returns.
+//
+// This recurses through all subnodes then adds contents of the nodes to the
+// string.
+int XMLNode::CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, int nFormat)
+{
+    int nResult = 0;
+    int cb=nFormat<0?0:nFormat;
+    int cbElement;
+    int nChildFormat=-1;
+    int nElementI=pEntry->nChild+pEntry->nText+pEntry->nClear;
+    int i,j;
+    if ((nFormat>=0)&&(nElementI==1)&&(pEntry->nText==1)&&(!pEntry->isDeclaration)) nFormat=-2;
+
+    assert(pEntry);
+
+#define LENSTR(lpsz) (lpsz ? xstrlen(lpsz) : 0)
+
+    // If the element has no name then assume this is the head node.
+    cbElement = (int)LENSTR(pEntry->lpszName);
+
+    if (cbElement)
+    {
+        // "<elementname "
+        if (lpszMarker)
+        {
+            if (cb) charmemset(lpszMarker, INDENTCHAR, cb);
+            nResult = cb;
+            lpszMarker[nResult++]=_CXML('<');
+            if (pEntry->isDeclaration) lpszMarker[nResult++]=_CXML('?');
+            xstrcpy(&lpszMarker[nResult], pEntry->lpszName);
+            nResult+=cbElement;
+            lpszMarker[nResult++]=_CXML(' ');
+
+        } else
+        {
+            nResult+=cbElement+2+cb;
+            if (pEntry->isDeclaration) nResult++;
+        }
+
+        // Enumerate attributes and add them to the string
+        XMLAttribute *pAttr=pEntry->pAttribute;
+        for (i=0; i<pEntry->nAttribute; i++)
+        {
+            // "Attrib
+            cb = (int)LENSTR(pAttr->lpszName);
+            if (cb)
+            {
+                if (lpszMarker) xstrcpy(&lpszMarker[nResult], pAttr->lpszName);
+                nResult += cb;
+                // "Attrib=Value "
+                if (pAttr->lpszValue)
+                {
+                    cb=(int)ToXMLStringTool::lengthXMLString(pAttr->lpszValue);
+                    if (lpszMarker)
+                    {
+                        lpszMarker[nResult]=_CXML('=');
+                        lpszMarker[nResult+1]=_CXML('"');
+                        if (cb) ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult+2],pAttr->lpszValue);
+                        lpszMarker[nResult+cb+2]=_CXML('"');
+                    }
+                    nResult+=cb+3;
+                }
+                if (lpszMarker) lpszMarker[nResult] = _CXML(' ');
+                nResult++;
+            }
+            pAttr++;
+        }
+
+        if (pEntry->isDeclaration)
+        {
+            if (lpszMarker)
+            {
+                lpszMarker[nResult-1]=_CXML('?');
+                lpszMarker[nResult]=_CXML('>');
+            }
+            nResult++;
+            if (nFormat!=-1)
+            {
+                if (lpszMarker) lpszMarker[nResult]=_CXML('\n');
+                nResult++;
+            }
+        } else
+            // If there are child nodes we need to terminate the start tag
+            if (nElementI)
+            {
+                if (lpszMarker) lpszMarker[nResult-1]=_CXML('>');
+                if (nFormat>=0)
+                {
+                    if (lpszMarker) lpszMarker[nResult]=_CXML('\n');
+                    nResult++;
+                }
+            } else nResult--;
+    }
+
+    // Calculate the child format for when we recurse.  This is used to
+    // determine the number of spaces used for prefixes.
+    if (nFormat!=-1)
+    {
+        if (cbElement&&(!pEntry->isDeclaration)) nChildFormat=nFormat+1;
+        else nChildFormat=nFormat;
+    }
+
+    // Enumerate through remaining children
+    for (i=0; i<nElementI; i++)
+    {
+        j=pEntry->pOrder[i];
+        switch((XMLElementType)(j&3))
+        {
+        // Text nodes
+        case eNodeText:
+            {
+                // "Text"
+                XMLCSTR pChild=pEntry->pText[j>>2];
+                cb = (int)ToXMLStringTool::lengthXMLString(pChild);
+                if (cb)
+                {
+                    if (nFormat>=0)
+                    {
+                        if (lpszMarker)
+                        {
+                            charmemset(&lpszMarker[nResult],INDENTCHAR,nFormat+1);
+                            ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult+nFormat+1],pChild);
+                            lpszMarker[nResult+nFormat+1+cb]=_CXML('\n');
+                        }
+                        nResult+=cb+nFormat+2;
+                    } else
+                    {
+                        if (lpszMarker) ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult], pChild);
+                        nResult += cb;
+                    }
+                }
+                break;
+            }
+
+        // Clear type nodes
+        case eNodeClear:
+            {
+                XMLClear *pChild=pEntry->pClear+(j>>2);
+                // "OpenTag"
+                cb = (int)LENSTR(pChild->lpszOpenTag);
+                if (cb)
+                {
+                    if (nFormat!=-1)
+                    {
+                        if (lpszMarker)
+                        {
+                            charmemset(&lpszMarker[nResult], INDENTCHAR, nFormat+1);
+                            xstrcpy(&lpszMarker[nResult+nFormat+1], pChild->lpszOpenTag);
+                        }
+                        nResult+=cb+nFormat+1;
+                    }
+                    else
+                    {
+                        if (lpszMarker)xstrcpy(&lpszMarker[nResult], pChild->lpszOpenTag);
+                        nResult += cb;
+                    }
+                }
+
+                // "OpenTag Value"
+                cb = (int)LENSTR(pChild->lpszValue);
+                if (cb)
+                {
+                    if (lpszMarker) xstrcpy(&lpszMarker[nResult], pChild->lpszValue);
+                    nResult += cb;
+                }
+
+                // "OpenTag Value CloseTag"
+                cb = (int)LENSTR(pChild->lpszCloseTag);
+                if (cb)
+                {
+                    if (lpszMarker) xstrcpy(&lpszMarker[nResult], pChild->lpszCloseTag);
+                    nResult += cb;
+                }
+
+                if (nFormat!=-1)
+                {
+                    if (lpszMarker) lpszMarker[nResult] = _CXML('\n');
+                    nResult++;
+                }
+                break;
+            }
+
+        // Element nodes
+        case eNodeChild:
+            {
+                // Recursively add child nodes
+                nResult += CreateXMLStringR(pEntry->pChild[j>>2].d, lpszMarker ? lpszMarker + nResult : 0, nChildFormat);
+                break;
+            }
+        default: break;
+        }
+    }
+
+    if ((cbElement)&&(!pEntry->isDeclaration))
+    {
+        // If we have child entries we need to use long XML notation for
+        // closing the element - "<elementname>blah blah blah</elementname>"
+        if (nElementI)
+        {
+            // "</elementname>\0"
+            if (lpszMarker)
+            {
+                if (nFormat >=0)
+                {
+                    charmemset(&lpszMarker[nResult], INDENTCHAR,nFormat);
+                    nResult+=nFormat;
+                }
+
+                lpszMarker[nResult]=_CXML('<'); lpszMarker[nResult+1]=_CXML('/');
+                nResult += 2;
+                xstrcpy(&lpszMarker[nResult], pEntry->lpszName);
+                nResult += cbElement;
+
+                lpszMarker[nResult]=_CXML('>');
+                if (nFormat == -1) nResult++;
+                else
+                {
+                    lpszMarker[nResult+1]=_CXML('\n');
+                    nResult+=2;
+                }
+            } else
+            {
+                if (nFormat>=0) nResult+=cbElement+4+nFormat;
+                else if (nFormat==-1) nResult+=cbElement+3;
+                else nResult+=cbElement+4;
+            }
+        } else
+        {
+            // If there are no children we can use shorthand XML notation -
+            // "<elementname/>"
+            // "/>\0"
+            if (lpszMarker)
+            {
+                lpszMarker[nResult]=_CXML('/'); lpszMarker[nResult+1]=_CXML('>');
+                if (nFormat != -1) lpszMarker[nResult+2]=_CXML('\n');
+            }
+            nResult += nFormat == -1 ? 2 : 3;
+        }
+    }
+
+    return nResult;
+}
+
+#undef LENSTR
+
+// Create an XML string
+// @param       int nFormat             - 0 if no formatting is required
+//                                        otherwise nonzero for formatted text
+//                                        with carriage returns and indentation.
+// @param       int *pnSize             - [out] pointer to the size of the
+//                                        returned string not including the
+//                                        NULL terminator.
+// @return      XMLSTR                  - Allocated XML string, you must free
+//                                        this with free().
+XMLSTR XMLNode::createXMLString(int nFormat, int *pnSize) const
+{
+    if (!d) { if (pnSize) *pnSize=0; return NULL; }
+
+    XMLSTR lpszResult = NULL;
+    int cbStr;
+
+    // Recursively Calculate the size of the XML string
+    if (!dropWhiteSpace) nFormat=0;
+    nFormat = nFormat ? 0 : -1;
+    cbStr = CreateXMLStringR(d, 0, nFormat);
+    // Alllocate memory for the XML string + the NULL terminator and
+    // create the recursively XML string.
+    lpszResult=(XMLSTR)malloc((cbStr+1)*sizeof(XMLCHAR));
+    CreateXMLStringR(d, lpszResult, nFormat);
+    lpszResult[cbStr]=_CXML('\0');
+    if (pnSize) *pnSize = cbStr;
+    return lpszResult;
+}
+
+// Remove node data d from its parent's child array and order table.
+// The parent's child count is decremented and the child array is freed when
+// it becomes empty.  d->pParent itself is left untouched.
+// @return  the value returned by removeOrderElement() for the removed child.
+int XMLNode::detachFromParent(XMLNodeData *d)
+{
+    XMLNodeData *parent=d->pParent;
+    XMLNode *siblings=parent->pChild;
+
+    // Locate d among its siblings (it is expected to be there).
+    int idx;
+    for (idx=0; ((void*)(siblings[idx].d))!=((void*)d); idx++) {}
+
+    parent->nChild--;
+    if (!parent->nChild) { free(siblings); parent->pChild=NULL; }
+    else memmove(siblings+idx,siblings+idx+1,(parent->nChild-idx)*sizeof(XMLNode));
+    return removeOrderElement(parent,eNodeChild,idx);
+}
+
+// Drop this handle's reference; emptyTheNode(0) releases the data once the
+// count has reached zero.
+XMLNode::~XMLNode()
+{
+    if (d)
+    {
+        d->ref_count--;
+        emptyTheNode(0);
+    }
+}
+// Detach the node from its parent (dropping the reference held on behalf of
+// the parent) and then forcibly empty it.
+void XMLNode::deleteNodeContent()
+{
+    if (d==NULL) return;
+    if (d->pParent!=NULL)
+    {
+        detachFromParent(d);
+        d->pParent=NULL;
+        d->ref_count--;
+    }
+    emptyTheNode(1);
+}
+// Release the content of this node.
+// The content (children, texts, clear items, attributes, order table, name)
+// is released when the reference count is zero or when force is non-zero.
+// The XMLNodeData structure itself is freed, and d set to NULL, only when the
+// reference count is zero.
+// @param force  non-zero to empty the node even if it is still referenced;
+//               the same value is passed down to the children.
+void XMLNode::emptyTheNode(char force)
+{
+    // NOTE(review): the local copy is presumably needed because d may change
+    // while the node is being torn down - keep it as the original warns.
+    XMLNodeData *dd=d; // warning: must stay this way!
+    if ((dd->ref_count==0)||force)
+    {
+        if (d->pParent) detachFromParent(d);
+        int i;
+        XMLNode *pc;
+        // Orphan every child, drop the reference held for the parent link,
+        // and let the child release itself.
+        for(i=0; i<dd->nChild; i++)
+        {
+            pc=dd->pChild+i;
+            pc->d->pParent=NULL;
+            pc->d->ref_count--;
+            pc->emptyTheNode(force);
+        }
+        myFree(dd->pChild);
+        for(i=0; i<dd->nText; i++) free((void*)dd->pText[i]);
+        myFree(dd->pText);
+        // Only the value of a clear item is freed here; its open/close tag
+        // pointers are not.
+        for(i=0; i<dd->nClear; i++) free((void*)dd->pClear[i].lpszValue);
+        myFree(dd->pClear);
+        for(i=0; i<dd->nAttribute; i++)
+        {
+            free((void*)dd->pAttribute[i].lpszName);
+            if (dd->pAttribute[i].lpszValue) free((void*)dd->pAttribute[i].lpszValue);
+        }
+        myFree(dd->pAttribute);
+        myFree(dd->pOrder);
+        myFree((void*)dd->lpszName);
+        // Leave the structure in a consistent empty state: it may outlive
+        // this call when it was emptied by force while still referenced.
+        dd->nChild=0;    dd->nText=0;    dd->nClear=0;    dd->nAttribute=0;
+        dd->pChild=NULL; dd->pText=NULL; dd->pClear=NULL; dd->pAttribute=NULL;
+        dd->pOrder=NULL; dd->lpszName=NULL; dd->pParent=NULL;
+    }
+    if (dd->ref_count==0)
+    {
+        free(dd);
+        d=NULL;
+    }
+}
+
+// Shallow assignment: release the reference currently held, then share A's
+// data and count the new reference.  Self-assignment is a no-op.
+XMLNode& XMLNode::operator=( const XMLNode& A )
+{
+    if (this == &A) return *this;
+
+    if (d)
+    {
+        d->ref_count--;
+        emptyTheNode(0);
+    }
+    d=A.d;
+    if (d) d->ref_count++;
+    return *this;
+}
+
+// Shallow copy: share A's data and count the additional reference.
+XMLNode::XMLNode(const XMLNode &A) : d(A.d)
+{
+    if (d!=NULL) d->ref_count++;
+}
+
+// Build an independent copy of this node and of everything below it.
+// Names, attribute names/values, texts and clear values are duplicated with
+// stringDup(); the open/close tag pointers of clear items are copied as
+// pointers, not duplicated.  Children are copied recursively.
+// @return  the new node (not attached to any parent), or emptyXMLNode when
+//          this node is empty.
+XMLNode XMLNode::deepCopy() const
+{
+    if (!d) return XMLNode::emptyXMLNode;
+    XMLNode x(NULL,stringDup(d->lpszName),d->isDeclaration);
+    XMLNodeData *p=x.d;
+    int n=d->nAttribute;
+    if (n)
+    {
+        p->nAttribute=n; p->pAttribute=(XMLAttribute*)malloc(n*sizeof(XMLAttribute));
+        while (n--)
+        {
+            p->pAttribute[n].lpszName=stringDup(d->pAttribute[n].lpszName);
+            p->pAttribute[n].lpszValue=stringDup(d->pAttribute[n].lpszValue);
+        }
+    }
+    // The order table has one entry per child, text and clear item.
+    if (d->pOrder)
+    {
+        n=(d->nChild+d->nText+d->nClear)*sizeof(int); p->pOrder=(int*)malloc(n); memcpy(p->pOrder,d->pOrder,n);
+    }
+    n=d->nText;
+    if (n)
+    {
+        p->nText=n; p->pText=(XMLCSTR*)malloc(n*sizeof(XMLCSTR));
+        while(n--) p->pText[n]=stringDup(d->pText[n]);
+    }
+    n=d->nClear;
+    if (n)
+    {
+        p->nClear=n; p->pClear=(XMLClear*)malloc(n*sizeof(XMLClear));
+        while (n--)
+        {
+            p->pClear[n].lpszCloseTag=d->pClear[n].lpszCloseTag;
+            p->pClear[n].lpszOpenTag=d->pClear[n].lpszOpenTag;
+            p->pClear[n].lpszValue=stringDup(d->pClear[n].lpszValue);
+        }
+    }
+    n=d->nChild;
+    if (n)
+    {
+        p->nChild=n; p->pChild=(XMLNode*)malloc(n*sizeof(XMLNode));
+        while (n--)
+        {
+            // The array comes from malloc(), so the slot is uninitialised:
+            // clear d first so that operator= does not try to release it.
+            p->pChild[n].d=NULL;
+            p->pChild[n]=d->pChild[n].deepCopy();
+            p->pChild[n].d->pParent=p;
+        }
+    }
+    return x;
+}
+
+// Attach an existing node as a child of this node.
+// @param childNode  node to attach.  If it already has a parent it is
+//                   detached from that parent first.
+// @param pos        requested position, forwarded to addToOrder().
+// @return           childNode; nothing is changed when either this node or
+//                   childNode is empty.
+XMLNode XMLNode::addChild(XMLNode childNode, int pos)
+{
+    XMLNodeData *dc=childNode.d;
+    if ((!dc)||(!d)) return childNode;
+    if (!dc->lpszName)
+    {
+        // this is a root node: todo: correct fix
+        // A nameless node is not attached itself: its children are moved one
+        // by one.  Each move detaches the child from dc, so dc->nChild
+        // shrinks until the loop ends.
+        int j=pos;
+        while (dc->nChild)
+        {
+            addChild(dc->pChild[0],j);
+            if (pos>=0) j++;
+        }
+        return childNode;
+    }
+    // Already parented: detach, and shift the target position down by one
+    // when the child came from this very node at or before pos.
+    // Not parented yet: the new parent link counts as one more reference.
+    if (dc->pParent) { if ((detachFromParent(dc)<=pos)&&(dc->pParent==d)) pos--; } else dc->ref_count++;
+    dc->pParent=d;
+//     int nc=d->nChild;
+//     d->pChild=(XMLNode*)myRealloc(d->pChild,(nc+1),memoryIncrease,sizeof(XMLNode));
+    // NOTE(review): addToOrder() presumably makes room for the new entry and
+    // rewrites pos to the slot index inside pChild - confirm in its definition.
+    d->pChild=(XMLNode*)addToOrder(0,&pos,d->nChild,d->pChild,sizeof(XMLNode),eNodeChild);
+    d->pChild[pos].d=dc;
+    d->nChild++;
+    return childNode;
+}
+
+// Remove the attribute at index i, freeing its name and value.
+// Does nothing for an empty node or an index outside [0, nAttribute).
+void XMLNode::deleteAttribute(int i)
+{
+    if (!d) return;
+    if ((i<0)||(i>=d->nAttribute)) return;
+
+    XMLAttribute *slot=d->pAttribute+i;
+    d->nAttribute--;
+    free((void*)slot->lpszName);
+    if (slot->lpszValue) free((void*)slot->lpszValue);
+
+    // Either release the now empty array or close the gap.
+    if (d->nAttribute==0) { free(slot); d->pAttribute=NULL; }
+    else memmove(slot,slot+1,(d->nAttribute-i)*sizeof(XMLAttribute));
+}
+
+// Remove the attribute that has the same name as *a; a NULL pointer is ignored.
+void XMLNode::deleteAttribute(XMLAttribute *a)
+{
+    if (!a) return;
+    deleteAttribute(a->lpszName);
+}
+// Remove the first attribute whose name matches lpszName, if there is one.
+void XMLNode::deleteAttribute(XMLCSTR lpszName)
+{
+    // getAttribute() leaves "index of the match + 1" in its cursor argument,
+    // so a cursor that is still 0 means no attribute matched.
+    int next=0;
+    getAttribute(lpszName,&next);
+    if (next!=0) deleteAttribute(next-1);
+}
+
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,int i)
+{
+    if (!d) { if (lpszNewValue) free(lpszNewValue); if (lpszNewName) free(lpszNewName); return NULL; }
+    if (i>=d->nAttribute)
+    {
+        if (lpszNewName) return addAttribute_WOSD(lpszNewName,lpszNewValue);
+        return NULL;
+    }
+    XMLAttribute *p=d->pAttribute+i;
+    if (p->lpszValue&&p->lpszValue!=lpszNewValue) free((void*)p->lpszValue);
+    p->lpszValue=lpszNewValue;
+    if (lpszNewName&&p->lpszName!=lpszNewName) { free((void*)p->lpszName); p->lpszName=lpszNewName; };
+    return p;
+}
+
+// Update the attribute named like *oldAttribute with the name and value held
+// in *newAttribute (strings are taken over, not duplicated).  With a NULL
+// oldAttribute the new attribute is simply added.
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLAttribute *newAttribute, XMLAttribute *oldAttribute)
+{
+    if (!oldAttribute)
+        return addAttribute_WOSD((XMLSTR)newAttribute->lpszName,(XMLSTR)newAttribute->lpszValue);
+    return updateAttribute_WOSD((XMLSTR)newAttribute->lpszValue,(XMLSTR)newAttribute->lpszName,oldAttribute->lpszName);
+}
+
+// Update the first attribute named lpszOldName.  When no such attribute
+// exists one is added, named lpszNewName if given and otherwise a copy of
+// lpszOldName.
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,XMLCSTR lpszOldName)
+{
+    // getAttribute() stores "index of the match + 1" in the cursor.
+    int next=0;
+    getAttribute(lpszOldName,&next);
+    if (next) return updateAttribute_WOSD(lpszNewValue,lpszNewName,next-1);
+
+    if (!lpszNewName) return addAttribute_WOSD(stringDup(lpszOldName),lpszNewValue);
+    return addAttribute_WOSD(lpszNewName,lpszNewValue);
+}
+
+// Return the index of the text item whose pointer is lpszValue.
+// The match is on pointer identity, not on string content.  A NULL argument
+// selects the first text item if there is one.
+// @return  the index, or -1 when nothing matches or the node is empty.
+int XMLNode::indexText(XMLCSTR lpszValue) const
+{
+    if (!d) return -1;
+    const int count=d->nText;
+    if (!lpszValue) return count ? 0 : -1;
+
+    XMLCSTR *texts=d->pText;
+    for (int k=0; k<count; k++)
+        if (texts[k]==lpszValue) return k;
+    return -1;
+}
+
+// Remove the text item at index i, free its string and drop its entry from
+// the order table.  Does nothing for an empty node or an index outside
+// [0, nText).
+void XMLNode::deleteText(int i)
+{
+    if (!d) return;
+    if ((i<0)||(i>=d->nText)) return;
+
+    XMLCSTR *slot=d->pText+i;
+    d->nText--;
+    free((void*)*slot);
+
+    // Either release the now empty array or close the gap.
+    if (d->nText==0) { free(slot); d->pText=NULL; }
+    else memmove(slot,slot+1,(d->nText-i)*sizeof(XMLCSTR));
+    removeOrderElement(d,eNodeText,i);
+}
+
+// Remove the text item located by indexText(lpszValue); nothing happens when
+// that lookup yields -1.
+void XMLNode::deleteText(XMLCSTR lpszValue)
+{
+    const int idx=indexText(lpszValue);
+    deleteText(idx);
+}
+
+// Replace the text item at index i with lpszNewValue ("WOSD": the string is
+// taken over, not duplicated).
+// @param lpszNewValue  new text; freed here when the node is empty.
+// @param i             index of the text to replace.  An index that does not
+//                      designate an existing text item adds a new one, as the
+//                      by-value overload does when its lookup fails.
+// @return              the stored text, or NULL when the node is empty.
+XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, int i)
+{
+    if (!d) { if (lpszNewValue) free(lpszNewValue); return NULL; }
+    // A negative index used to be read as pText[i]; treat it as "no such
+    // item" instead.
+    if ((i<0)||(i>=d->nText)) return addText_WOSD(lpszNewValue);
+    XMLCSTR *p=d->pText+i;
+    // Skip the free when the very same string is handed back.
+    if (*p!=lpszNewValue) { free((void*)*p); *p=lpszNewValue; }
+    return lpszNewValue;
+}
+
+// Replace the text item located by indexText(lpszOldValue) with
+// lpszNewValue; when the lookup fails the new text is added instead.
+// On an empty node the new string is freed and NULL is returned.
+XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue)
+{
+    if (!d)
+    {
+        if (lpszNewValue) free(lpszNewValue);
+        return NULL;
+    }
+    const int idx=indexText(lpszOldValue);
+    if (idx<0) return addText_WOSD(lpszNewValue);
+    return updateText_WOSD(lpszNewValue,idx);
+}
+
+// Remove the clear item at index i, free its value and drop its entry from
+// the order table.  Does nothing for an empty node or an index outside
+// [0, nClear).
+void XMLNode::deleteClear(int i)
+{
+    if (!d) return;
+    if ((i<0)||(i>=d->nClear)) return;
+
+    XMLClear *slot=d->pClear+i;
+    d->nClear--;
+    free((void*)slot->lpszValue);
+
+    // Either release the now empty array or close the gap.
+    if (d->nClear==0) { free(slot); d->pClear=NULL; }
+    else memmove(slot,slot+1,(d->nClear-i)*sizeof(XMLClear));
+    removeOrderElement(d,eNodeClear,i);
+}
+
+// Return the index of the clear item whose value pointer is lpszValue.
+// The match is on pointer identity, not on string content.  A NULL argument
+// selects the first clear item if there is one.
+// @return  the index, or -1 when nothing matches or the node is empty.
+int XMLNode::indexClear(XMLCSTR lpszValue) const
+{
+    if (!d) return -1;
+    const int count=d->nClear;
+    if (!lpszValue) return count ? 0 : -1;
+
+    XMLClear *items=d->pClear;
+    for (int k=0; k<count; k++)
+        if (items[k].lpszValue==lpszValue) return k;
+    return -1;
+}
+
+// Remove the clear item located by indexClear(lpszValue); nothing happens
+// when that lookup yields -1.
+void XMLNode::deleteClear(XMLCSTR lpszValue)
+{
+    const int idx=indexClear(lpszValue);
+    deleteClear(idx);
+}
+// Remove the clear item whose value pointer equals a->lpszValue; a NULL
+// pointer is ignored.
+void XMLNode::deleteClear(XMLClear *a)
+{
+    if (!a) return;
+    deleteClear(a->lpszValue);
+}
+
+// Replace the value of the clear item at index i with lpszNewContent
+// ("WOSD": the string is taken over, not duplicated).
+// @param lpszNewContent  new value; freed here when the node is empty.
+// @param i               index of the clear item.  An index that does not
+//                        designate an existing item adds a new one, as the
+//                        by-value overload does when its lookup fails.
+// @return                the updated or added clear item, or NULL when the
+//                        node is empty.
+XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent, int i)
+{
+    if (!d) { if (lpszNewContent) free(lpszNewContent); return NULL; }
+    // A negative index used to be read as pClear[i]; treat it as "no such
+    // item" instead.
+    if ((i<0)||(i>=d->nClear)) return addClear_WOSD(lpszNewContent);
+    XMLClear *p=d->pClear+i;
+    // Skip the free when the very same string is handed back.
+    if (lpszNewContent!=p->lpszValue) { free((void*)p->lpszValue); p->lpszValue=lpszNewContent; }
+    return p;
+}
+
+// Replace the value of the clear item located by indexClear(lpszOldValue);
+// when the lookup fails a new clear item is added instead.
+// On an empty node the new string is freed and NULL is returned.
+XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent, XMLCSTR lpszOldValue)
+{
+    if (!d)
+    {
+        if (lpszNewContent) free(lpszNewContent);
+        return NULL;
+    }
+    const int idx=indexClear(lpszOldValue);
+    if (idx<0) return addClear_WOSD(lpszNewContent);
+    return updateClear_WOSD(lpszNewContent,idx);
+}
+
+// Replace the value of the clear item identified by oldP->lpszValue with
+// newP->lpszValue.  Returns NULL without doing anything when oldP is NULL.
+XMLClear *XMLNode::updateClear_WOSD(XMLClear *newP,XMLClear *oldP)
+{
+    if (!oldP) return NULL;
+    return updateClear_WOSD((XMLSTR)newP->lpszValue,(XMLSTR)oldP->lpszValue);
+}
+
+// Count the direct children whose name compares equal to name under
+// xstricmp().  An empty node has none.
+int XMLNode::nChildNode(XMLCSTR name) const
+{
+    if (!d) return 0;
+    int matches=0;
+    const int total=d->nChild;
+    for (int k=0; k<total; k++)
+    {
+        if (xstricmp(d->pChild[k].d->lpszName, name)==0) matches++;
+    }
+    return matches;
+}
+
+XMLNode XMLNode::getChildNode(XMLCSTR name, int *j) const
+{
+    if (!d) return emptyXMLNode;
+    int i=0,n=d->nChild;
+    if (j) i=*j;
+    XMLNode *pc=d->pChild+i;
+    for (; i<n; i++)
+    {
+        if (!xstricmp(pc->d->lpszName, name))
+        {
+            if (j) *j=i+1;
+            return *pc;
+        }
+        pc++;
+    }
+    return emptyXMLNode;
+}
+
+// Return the j-th child (0-based) named name.  A negative j selects the last
+// child carrying that name.
+// @return  the child, or emptyXMLNode when there is no such child.
+XMLNode XMLNode::getChildNode(XMLCSTR name, int j) const
+{
+    if (!d) return emptyXMLNode;
+
+    if (j<0)
+    {
+        // Scan backwards for the last match.
+        for (int k=d->nChild-1; k>=0; k--)
+            if (!xstricmp(name,d->pChild[k].d->lpszName)) return getChildNode(k);
+        return emptyXMLNode;
+    }
+
+    // Step over the first j matches, then return the next one.
+    int cursor=0;
+    for (; j>0; j--) getChildNode(name,&cursor);
+    return getChildNode(name,&cursor);
+}
+
+// Resolve a separator-delimited path of child names.  The work is done by
+// getChildNodeByPathNonConst() on a private copy of _path, which is freed
+// before returning.
+XMLNode XMLNode::getChildNodeByPath(XMLCSTR _path, char createMissing, XMLCHAR sep)
+{
+    XMLSTR scratch=stringDup(_path);
+    XMLNode found=getChildNodeByPathNonConst(scratch,createMissing,sep);
+    if (scratch) free(scratch);
+    return found;
+}
+
+// Walk down the tree following a path of child names separated by sep.
+// @param path             path to follow.  It is modified temporarily (each
+//                         separator is overwritten with 0 while its segment is
+//                         looked up) and restored before returning.  A NULL or
+//                         empty path yields this node.
+// @param createIfMissing  non-zero to add any missing child along the way.
+// @param sep              separator character.
+// @return                 the node at the end of the path, or an empty node
+//                         when a segment is missing and not created.
+XMLNode XMLNode::getChildNodeByPathNonConst(XMLSTR path, char createIfMissing, XMLCHAR sep)
+{
+    if ((!path)||(!(*path))) return *this;
+    XMLNode xn,xbase=*this;
+    XMLCHAR *tend1,sepString[2]; sepString[0]=sep; sepString[1]=0;
+    tend1=xstrstr(path,sepString);
+    while(tend1)
+    {
+        // Terminate the current segment so it can be used as a name.
+        *tend1=0;
+        xn=xbase.getChildNode(path);
+        if (xn.isEmpty())
+        {
+            if (createIfMissing) xn=xbase.addChild(path);
+            else { *tend1=sep; return XMLNode::emptyXMLNode; }
+        }
+        // Put the separator back and descend into the segment just resolved.
+        *tend1=sep;
+        xbase=xn;
+        path=tend1+1;
+        tend1=xstrstr(path,sepString);
+    }
+    // Last segment (no separator after it).
+    xn=xbase.getChildNode(path);
+    if (xn.isEmpty()&&createIfMissing) xn=xbase.addChild(path);
+    return xn;
+}
+
+XMLElementPosition XMLNode::positionOfText     (int i) const { if (i>=d->nText ) i=d->nText-1;  return findPosition(d,i,eNodeText ); }
+XMLElementPosition XMLNode::positionOfClear    (int i) const { if (i>=d->nClear) i=d->nClear-1; return findPosition(d,i,eNodeClear); }
+XMLElementPosition XMLNode::positionOfChildNode(int i) const { if (i>=d->nChild) i=d->nChild-1; return findPosition(d,i,eNodeChild); }
+XMLElementPosition XMLNode::positionOfText (XMLCSTR lpszValue) const { return positionOfText (indexText (lpszValue)); }
+XMLElementPosition XMLNode::positionOfClear(XMLCSTR lpszValue) const { return positionOfClear(indexClear(lpszValue)); }
+XMLElementPosition XMLNode::positionOfClear(XMLClear *a) const { if (a) return positionOfClear(a->lpszValue); return positionOfClear(); }
+// Return the position (as computed by findPosition()) of the child that
+// shares its data with x, or -1 when either node is empty or x is not a
+// child of this node.
+XMLElementPosition XMLNode::positionOfChildNode(XMLNode x)  const
+{
+    if (!d) return -1;
+    XMLNodeData *target=x.d;
+    if (!target) return -1;
+
+    // Scan the children from the last to the first.
+    for (int k=d->nChild-1; k>=0; k--)
+        if (d->pChild[k].d==target) return findPosition(d,k,eNodeChild);
+    return -1;
+}
+// Return the position of the count-th child (0-based) named name.
+// @param name   child name; when NULL the call is forwarded to
+//               positionOfChildNode(count).
+// @param count  how many earlier matches to skip.
+// @return       the position computed by findPosition(), or -1 when this node
+//               has fewer than count+1 children with that name.
+XMLElementPosition XMLNode::positionOfChildNode(XMLCSTR name, int count) const
+{
+    if (!name) return positionOfChildNode(count);
+    int j=0;
+    // getChildNode() never makes the cursor negative: a failed lookup leaves
+    // it untouched and returns an empty node, so that is what has to be
+    // tested.  The previous "j<0" test could not fire and a missing child led
+    // to findPosition() being called with a stale or -1 index.
+    do { if (getChildNode(name,&j).isEmpty()) return -1; } while (count--);
+    // After a successful lookup j is the index following the match.
+    return findPosition(d,j-1,eNodeChild);
+}
+
+XMLNode XMLNode::getChildNodeWithAttribute(XMLCSTR name,XMLCSTR attributeName,XMLCSTR attributeValue, int *k) const
+{
+     int i=0,j;
+     if (k) i=*k;
+     XMLNode x;
+     XMLCSTR t;
+     do
+     {
+         x=getChildNode(name,&i);
+         if (!x.isEmpty())
+         {
+             if (attributeValue)
+             {
+                 j=0;
+                 do
+                 {
+                     t=x.getAttribute(attributeName,&j);
+                     if (t&&(xstricmp(attributeValue,t)==0)) { if (k) *k=i; return x; }
+                 } while (t);
+             } else
+             {
+                 if (x.isAttributeSet(attributeName)) { if (k) *k=i; return x; }
+             }
+         }
+     } while (!x.isEmpty());
+     return emptyXMLNode;
+}
+
+// Find an attribute on an node.
+XMLCSTR XMLNode::getAttribute(XMLCSTR lpszAttrib, int *j) const
+{
+    if (!d) return NULL;
+    int i=0,n=d->nAttribute;
+    if (j) i=*j;
+    XMLAttribute *pAttr=d->pAttribute+i;
+    for (; i<n; i++)
+    {
+        if (xstricmp(pAttr->lpszName, lpszAttrib)==0)
+        {
+            if (j) *j=i+1;
+            return pAttr->lpszValue;
+        }
+        pAttr++;
+    }
+    return NULL;
+}
+
+// Tell whether the node has an attribute named lpszAttrib (compared with
+// xstricmp()), whatever its value.
+// @return  TRUE when such an attribute exists, FALSE otherwise.
+char XMLNode::isAttributeSet(XMLCSTR lpszAttrib) const
+{
+    if (!d) return FALSE;
+    XMLAttribute *cur=d->pAttribute;
+    for (int left=d->nAttribute; left>0; left--, cur++)
+    {
+        if (xstricmp(cur->lpszName, lpszAttrib)==0) return TRUE;
+    }
+    return FALSE;
+}
+
+// Return the value of the j-th attribute (0-based) named name, or NULL when
+// there is no such attribute.
+XMLCSTR XMLNode::getAttribute(XMLCSTR name, int j) const
+{
+    if (!d) return NULL;
+    // Step over the first j matches, then return the next one.
+    int cursor=0;
+    for (; j>0; j--) getAttribute(name,&cursor);
+    return getAttribute(name,&cursor);
+}
+
+XMLNodeContents XMLNode::enumContents(int i) const
+{
+    XMLNodeContents c;
+    if (!d) { c.etype=eNodeNULL; return c; }
+    if (i<d->nAttribute)
+    {
+        c.etype=eNodeAttribute;
+        c.attrib=d->pAttribute[i];
+        return c;
+    }
+    i-=d->nAttribute;
+    c.etype=(XMLElementType)(d->pOrder[i]&3);
+    i=(d->pOrder[i])>>2;
+    switch (c.etype)
+    {
+    case eNodeChild:     c.child = d->pChild[i];      break;
+    case eNodeText:      c.text  = d->pText[i];       break;
+    case eNodeClear:     c.clear = d->pClear[i];      break;
+    default: break;
+    }
+    return c;
+}
+
+XMLCSTR XMLNode::getName() const { if (!d) return NULL; return d->lpszName;   }
+int XMLNode::nText()       const { if (!d) return 0;    return d->nText;      }
+int XMLNode::nChildNode()  const { if (!d) return 0;    return d->nChild;     }
+int XMLNode::nAttribute()  const { if (!d) return 0;    return d->nAttribute; }
+int XMLNode::nClear()      const { if (!d) return 0;    return d->nClear;     }
+int XMLNode::nElement()    const { if (!d) return 0;    return d->nAttribute+d->nChild+d->nText+d->nClear; }
+XMLClear     XMLNode::getClear         (int i) const { if ((!d)||(i>=d->nClear    )) return emptyXMLClear;     return d->pClear[i];     }
+XMLAttribute XMLNode::getAttribute     (int i) const { if ((!d)||(i>=d->nAttribute)) return emptyXMLAttribute; return d->pAttribute[i]; }
+XMLCSTR      XMLNode::getAttributeName (int i) const { if ((!d)||(i>=d->nAttribute)) return NULL;              return d->pAttribute[i].lpszName;  }
+XMLCSTR      XMLNode::getAttributeValue(int i) const { if ((!d)||(i>=d->nAttribute)) return NULL;              return d->pAttribute[i].lpszValue; }
+XMLCSTR      XMLNode::getText          (int i) const { if ((!d)||(i>=d->nText     )) return NULL;              return d->pText[i];      }
+XMLNode      XMLNode::getChildNode     (int i) const { if ((!d)||(i>=d->nChild    )) return emptyXMLNode;      return d->pChild[i];     }
+XMLNode      XMLNode::getParentNode    (     ) const { if ((!d)||(!d->pParent     )) return emptyXMLNode;      return XMLNode(d->pParent); }
+char         XMLNode::isDeclaration    (     ) const { if (!d) return 0;             return d->isDeclaration; }
+char         XMLNode::isEmpty          (     ) const { return (d==NULL); }
+XMLNode       XMLNode::emptyNode       (     )       { return XMLNode::emptyXMLNode; }
+
+// Public add*/updateName entry points.
+// Each function comes in two flavours that forward to the same private
+// helper: the plain one passes copies made with stringDup(), the _WOSD one
+// passes the caller's strings through as they are.  The open/close tags of
+// clear items are forwarded without duplication in both flavours.
+XMLNode       XMLNode::addChild(XMLCSTR lpszName, char isDeclaration, XMLElementPosition pos)
+              { return addChild_priv(0,stringDup(lpszName),isDeclaration,pos); }
+XMLNode       XMLNode::addChild_WOSD(XMLSTR lpszName, char isDeclaration, XMLElementPosition pos)
+              { return addChild_priv(0,lpszName,isDeclaration,pos); }
+XMLAttribute *XMLNode::addAttribute(XMLCSTR lpszName, XMLCSTR lpszValue)
+              { return addAttribute_priv(0,stringDup(lpszName),stringDup(lpszValue)); }
+XMLAttribute *XMLNode::addAttribute_WOSD(XMLSTR lpszName, XMLSTR lpszValuev)
+              { return addAttribute_priv(0,lpszName,lpszValuev); }
+XMLCSTR       XMLNode::addText(XMLCSTR lpszValue, XMLElementPosition pos)
+              { return addText_priv(0,stringDup(lpszValue),pos); }
+XMLCSTR       XMLNode::addText_WOSD(XMLSTR lpszValue, XMLElementPosition pos)
+              { return addText_priv(0,lpszValue,pos); }
+XMLClear     *XMLNode::addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, XMLElementPosition pos)
+              { return addClear_priv(0,stringDup(lpszValue),lpszOpen,lpszClose,pos); }
+XMLClear     *XMLNode::addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, XMLElementPosition pos)
+              { return addClear_priv(0,lpszValue,lpszOpen,lpszClose,pos); }
+// Rename the node using a copy of lpszName.
+XMLCSTR       XMLNode::updateName(XMLCSTR lpszName)
+              { return updateName_WOSD(stringDup(lpszName)); }
+// Update the attribute named like *oldAttribute with copies of the name and
+// value held in *newAttribute.  A NULL oldAttribute adds the new attribute,
+// as updateAttribute_WOSD(XMLAttribute*,XMLAttribute*) does, instead of being
+// dereferenced.
+XMLAttribute *XMLNode::updateAttribute(XMLAttribute *newAttribute, XMLAttribute *oldAttribute)
+              {
+                  if (!oldAttribute) return addAttribute(newAttribute->lpszName,newAttribute->lpszValue);
+                  return updateAttribute_WOSD(stringDup(newAttribute->lpszValue),stringDup(newAttribute->lpszName),oldAttribute->lpszName);
+              }
+// Copying flavours of the update functions: each one duplicates the new
+// string(s) with stringDup() and forwards to the _WOSD overload that takes
+// the same selector (index or old value/name).
+XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,int i)
+              { return updateAttribute_WOSD(stringDup(lpszNewValue),stringDup(lpszNewName),i); }
+XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,XMLCSTR lpszOldName)
+              { return updateAttribute_WOSD(stringDup(lpszNewValue),stringDup(lpszNewName),lpszOldName); }
+XMLCSTR       XMLNode::updateText(XMLCSTR lpszNewValue, int i)
+              { return updateText_WOSD(stringDup(lpszNewValue),i); }
+XMLCSTR       XMLNode::updateText(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue)
+              { return updateText_WOSD(stringDup(lpszNewValue),lpszOldValue); }
+XMLClear     *XMLNode::updateClear(XMLCSTR lpszNewContent, int i)
+              { return updateClear_WOSD(stringDup(lpszNewContent),i); }
+XMLClear     *XMLNode::updateClear(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue)
+              { return updateClear_WOSD(stringDup(lpszNewValue),lpszOldValue); }
+// Replace the value of the clear item identified by oldP->lpszValue with a
+// copy of newP->lpszValue.  A NULL oldP returns NULL, as
+// updateClear_WOSD(XMLClear*,XMLClear*) does, instead of being dereferenced;
+// the check comes before the copy is made so that nothing is leaked.
+XMLClear     *XMLNode::updateClear(XMLClear *newP,XMLClear *oldP)
+              {
+                  if (!oldP) return NULL;
+                  return updateClear_WOSD(stringDup(newP->lpszValue),oldP->lpszValue);
+              }
+
+// Set the parser-wide options.
+// @param _characterEncoding             encoding to use.  In the wide-char
+//                                       build it is stored only when
+//                                       non-zero; otherwise it must be one of
+//                                       the encodings listed in the switch,
+//                                       which also selects the matching byte
+//                                       table.
+// @param _guessWideCharChars            stored in guessWideCharChars.
+// @param _dropWhiteSpace                stored in dropWhiteSpace.
+// @param _removeCommentsInMiddleOfText  stored in removeCommentsInMiddleOfText.
+// @return  0 on success, 1 when the encoding is not recognised.  The three
+//          flags are stored before the encoding is checked, so they are
+//          updated even when 1 is returned.
+char XMLNode::setGlobalOptions(XMLCharEncoding _characterEncoding, char _guessWideCharChars,
+                               char _dropWhiteSpace, char _removeCommentsInMiddleOfText)
+{
+    guessWideCharChars=_guessWideCharChars; dropWhiteSpace=_dropWhiteSpace; removeCommentsInMiddleOfText=_removeCommentsInMiddleOfText;
+#ifdef _XMLWIDECHAR
+    if (_characterEncoding) characterEncoding=_characterEncoding;
+#else
+    switch(_characterEncoding)
+    {
+    case char_encoding_UTF8:     characterEncoding=_characterEncoding; XML_ByteTable=XML_utf8ByteTable; break;
+    case char_encoding_legacy:   characterEncoding=_characterEncoding; XML_ByteTable=XML_legacyByteTable; break;
+    case char_encoding_ShiftJIS: characterEncoding=_characterEncoding; XML_ByteTable=XML_sjisByteTable; break;
+    case char_encoding_GB2312:   characterEncoding=_characterEncoding; XML_ByteTable=XML_gb2312ByteTable; break;
+    // Big5 and GBK share one byte table.
+    case char_encoding_Big5:
+    case char_encoding_GBK:      characterEncoding=_characterEncoding; XML_ByteTable=XML_gbk_big5_ByteTable; break;
+    default: return 1;
+    }
+#endif
+    return 0;
+}
+
+XMLNode::XMLCharEncoding XMLNode::guessCharEncoding(void *buf,int l, char useXMLEncodingAttribute)
+{   /* Heuristically guesses the character encoding of the first l bytes of buf.
+     * Returns (XMLCharEncoding)0 in wide-char builds, when fewer than 25 bytes
+     * are supplied, or when guessWideCharChars is set and myIsTextWideChar()
+     * reports wide text. Otherwise it looks for a UTF-8 byte-order mark, then
+     * checks the continuation bytes of multi-byte sequences, and finally (only
+     * if useXMLEncodingAttribute is non-zero) reads the encoding attribute from
+     * the first 200 bytes. Returns char_encoding_error when the attribute says
+     * UTF-8 but the byte scan found a malformed sequence. */
+#ifdef _XMLWIDECHAR
+    return (XMLCharEncoding)0;
+#else
+    if (l<25) return (XMLCharEncoding)0;
+    if (guessWideCharChars&&(myIsTextWideChar(buf,l))) return (XMLCharEncoding)0;
+    unsigned char *b=(unsigned char*)buf;
+    if ((b[0]==0xef)&&(b[1]==0xbb)&&(b[2]==0xbf)) return char_encoding_UTF8; // UTF-8 byte-order mark
+
+    // Does the data follow the UTF-8 model?
+    XMLCharEncoding bestGuess=char_encoding_UTF8;
+    int i=0;
+    while (i<l)
+        switch (XML_utf8ByteTable[b[i]]) // table value is presumably the sequence length announced by the lead byte - TODO confirm
+        {   // cases 4, 3 and 2 fall through on purpose: each step checks one continuation byte
+        case 4: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ?
+        case 3: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ?
+        case 2: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ?
+        case 1: i++; break;
+        case 0: i=l; // stop scanning; bestGuess is left unchanged
+        }
+    if (!useXMLEncodingAttribute) return bestGuess;
+    // if an encoding is specified and it differs from utf-8 then the data is non-utf8,
+    // otherwise it is utf-8
+    char bb[201];
+    l=mmin(l,200); // only the first 200 bytes are searched for the attribute
+    memcpy(bb,buf,l); // copy buf into bb to be able to do "bb[l]=0"
+    bb[l]=0;
+    b=(unsigned char*)strstr(bb,"encoding");
+    if (!b) return bestGuess;
+    b+=8; while XML_isSPACECHAR(*b) b++; if (*b!='=') return bestGuess; // skip the word "encoding" and following spaces; expect '='
+    b++;  while XML_isSPACECHAR(*b) b++; if ((*b!='\'')&&(*b!='"')) return bestGuess; // expect an opening quote
+    b++;  while XML_isSPACECHAR(*b) b++; // b now points at the encoding name
+
+    if ((xstrnicmp((char*)b,"utf-8",5)==0)||
+        (xstrnicmp((char*)b,"utf8",4)==0))
+    {
+        if (bestGuess==char_encoding_legacy) return char_encoding_error; // declared UTF-8 but the byte scan disagreed
+        return char_encoding_UTF8;
+    }
+
+    if ((xstrnicmp((char*)b,"shiftjis",8)==0)||
+        (xstrnicmp((char*)b,"shift-jis",9)==0)||
+        (xstrnicmp((char*)b,"sjis",4)==0)) return char_encoding_ShiftJIS;
+
+    if (xstrnicmp((char*)b,"GB2312",6)==0) return char_encoding_GB2312;
+    if (xstrnicmp((char*)b,"Big5",4)==0) return char_encoding_Big5;
+    if (xstrnicmp((char*)b,"GBK",3)==0) return char_encoding_GBK;
+
+    return char_encoding_legacy; // any other declared encoding is treated as legacy
+#endif
+}
+#undef XML_isSPACECHAR
+
+//////////////////////////////////////////////////////////
+//      Here starts the base64 conversion functions.    //
+//////////////////////////////////////////////////////////
+
+static const char base64Fillchar = _CXML('='); // padding character written after a final partial 3-byte group
+
+// Encoding lookup table: entry n is the base64 character for the 6-bit value n (0..63)
+XMLCSTR base64EncodeTable=_CXML("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
+
+// The decode table is indexed by byte value and gives the index (0..63) of any valid base64 character.
+// Special codes - 96: '='  -   97: space char (tab, LF, CR, space)   -   98: illegal char   -   99: end of string (NUL)
+const unsigned char base64DecodeTable[] = {
+    99,98,98,98,98,98,98,98,98,97,  97,98,98,97,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  //00 -29
+    98,98,97,98,98,98,98,98,98,98,  98,98,98,62,98,98,98,63,52,53,  54,55,56,57,58,59,60,61,98,98,  //30 -59
+    98,96,98,98,98, 0, 1, 2, 3, 4,   5, 6, 7, 8, 9,10,11,12,13,14,  15,16,17,18,19,20,21,22,23,24,  //60 -89
+    25,98,98,98,98,98,98,26,27,28,  29,30,31,32,33,34,35,36,37,38,  39,40,41,42,43,44,45,46,47,48,  //90 -119
+    49,50,51,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  //120 -149
+    98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  //150 -179
+    98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  //180 -209
+    98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98,98,98,98,98,  //210 -239
+    98,98,98,98,98,98,98,98,98,98,  98,98,98,98,98,98                                               //240 -255
+};
+
+XMLParserBase64Tool::~XMLParserBase64Tool(){ freeBuffer(); } // releases the internal buffer via freeBuffer()
+
+void XMLParserBase64Tool::freeBuffer()
+{
+    // Release the internal buffer and mark the tool as holding none.
+    // free(NULL) is a no-op, so no guard is needed.
+    free(buf);
+    buf=NULL;
+    buflen=0;
+}
+
+int XMLParserBase64Tool::encodeLength(int inlen, char formatted)
+{
+    // Number of characters (terminating NUL included) that encode() needs
+    // room for when given inlen input bytes: one 4-character group per
+    // started 3-byte group, plus one line break per 54 input bytes when
+    // formatted output is requested.
+    const int groups=(inlen-1)/3+1;
+    unsigned int needed=groups*4+1;
+    if (formatted)
+    {
+        needed+=inlen/54;
+    }
+    return needed;
+}
+
+XMLSTR XMLParserBase64Tool::encode(unsigned char *inbuf, unsigned int inlen, char formatted)
+{
+    int i=encodeLength(inlen,formatted),k=17,eLen=inlen/3,j;
+    alloc(i*sizeof(XMLCHAR));
+    XMLSTR curr=(XMLSTR)buf;
+    for(i=0;i<eLen;i++)
+    {
+        // Copy next three bytes into lower 24 bits of int, paying attention to sign.
+        j=(inbuf[0]<<16)|(inbuf[1]<<8)|inbuf[2]; inbuf+=3;
+        // Encode the int into four chars
+        *(curr++)=base64EncodeTable[ j>>18      ];
+        *(curr++)=base64EncodeTable[(j>>12)&0x3f];
+        *(curr++)=base64EncodeTable[(j>> 6)&0x3f];
+        *(curr++)=base64EncodeTable[(j    )&0x3f];
+        if (formatted) { if (!k) { *(curr++)=_CXML('\n'); k=18; } k--; }
+    }
+    eLen=inlen-eLen*3; // 0 - 2.
+    if (eLen==1)
+    {
+        *(curr++)=base64EncodeTable[ inbuf[0]>>2      ];
+        *(curr++)=base64EncodeTable[(inbuf[0]<<4)&0x3F];
+        *(curr++)=base64Fillchar;
+        *(curr++)=base64Fillchar;
+    } else if (eLen==2)
+    {
+        j=(inbuf[0]<<8)|inbuf[1];
+        *(curr++)=base64EncodeTable[ j>>10      ];
+        *(curr++)=base64EncodeTable[(j>> 4)&0x3f];
+        *(curr++)=base64EncodeTable[(j<< 2)&0x3f];
+        *(curr++)=base64Fillchar;
+    }
+    *(curr++)=0;
+    return (XMLSTR)buf;
+}
+
+unsigned int XMLParserBase64Tool::decodeSize(XMLCSTR data,XMLError *xe)
+{
+     if (xe) *xe=eXMLErrorNone;
+    int size=0;
+    unsigned char c;
+    //skip any extra characters (e.g. newlines or spaces)
+    while (*data)
+    {
+#ifdef _XMLWIDECHAR
+        if (*data>255) { if (xe) *xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; }
+#endif
+        c=base64DecodeTable[(unsigned char)(*data)];
+        if (c<97) size++;
+        else if (c==98) { if (xe) *xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; }
+        data++;
+    }
+    if (xe&&(size%4!=0)) *xe=eXMLErrorBase64DataSizeIsNotMultipleOf4;
+    if (size==0) return 0;
+    do { data--; size--; } while(*data==base64Fillchar); size++;
+    return (unsigned int)((size*3)/4);
+}
+
+unsigned char XMLParserBase64Tool::decode(XMLCSTR data, unsigned char *buf, int len, XMLError *xe)
+{   /* Decodes the NUL-terminated base64 string data into buf, writing at most
+     * len bytes. Whitespace in the input is skipped.
+     * Returns 2 when decoding finished at the end of the string or at '='
+     * padding, 1 (with *xe=eXMLErrorBase64DecodeTruncatedData) when the input
+     * ends or is padded where more data was expected, and 0 when an illegal
+     * character is met or buf is too small (*xe set accordingly).
+     * xe may be NULL. */
+    if (xe) *xe=eXMLErrorNone;
+    int i=0,p=0; // i: read index into data, p: write index into buf
+    unsigned char d,c;
+    for(;;)
+    {
+        // The macro below loads the next table code into its argument: it skips whitespace (97) and makes decode() return 0 on an illegal character (98).
+#ifdef _XMLWIDECHAR
+#define BASE64DECODE_READ_NEXT_CHAR(c)                                              \
+        do {                                                                        \
+            if (data[i]>255){ c=98; break; }                                        \
+            c=base64DecodeTable[(unsigned char)data[i++]];                       \
+        }while (c==97);                                                             \
+        if(c==98){ if(xe)*xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; }
+#else
+#define BASE64DECODE_READ_NEXT_CHAR(c)                                           \
+        do { c=base64DecodeTable[(unsigned char)data[i++]]; }while (c==97);   \
+        if(c==98){ if(xe)*xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; }
+#endif
+
+        BASE64DECODE_READ_NEXT_CHAR(c) // 1st character of a 4-character group
+        if (c==99) { return 2; } // end of string on a group boundary: done
+        if (c==96)
+        {
+            if (p==(int)len) return 2; // padding after exactly len bytes: done
+            if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;
+            return 1;
+        }
+
+        BASE64DECODE_READ_NEXT_CHAR(d) // 2nd character: completes the first output byte
+        if ((d==99)||(d==96)) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;  return 1; }
+        if (p==(int)len) {      if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall; return 0; }
+        buf[p++]=(unsigned char)((c<<2)|((d>>4)&0x3));
+
+        BASE64DECODE_READ_NEXT_CHAR(c) // 3rd character: completes the second output byte
+        if (c==99) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;  return 1; }
+        if (p==(int)len)
+        {
+            if (c==96) return 2; // padding once the buffer is exactly full: done
+            if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall;
+            return 0;
+        }
+        if (c==96) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;  return 1; }
+        buf[p++]=(unsigned char)(((d<<4)&0xf0)|((c>>2)&0xf));
+
+        BASE64DECODE_READ_NEXT_CHAR(d) // 4th character: completes the third output byte
+        if (d==99 ) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;  return 1; }
+        if (p==(int)len)
+        {
+            if (d==96) return 2; // padding once the buffer is exactly full: done
+            if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall;
+            return 0;
+        }
+        if (d==96) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData;  return 1; }
+        buf[p++]=(unsigned char)(((c<<6)&0xc0)|d);
+    }
+}
+#undef BASE64DECODE_READ_NEXT_CHAR
+
+void XMLParserBase64Tool::alloc(int newsize)
+{
+    // Ensures the internal buffer holds at least newsize bytes. The buffer is
+    // never shrunk. On allocation failure the tool is left with no buffer
+    // (buf==NULL, buflen==0), so buflen never claims a size that was not
+    // obtained.
+    if ((!buf)&&(newsize)) { buf=malloc(newsize); buflen=buf?newsize:0; return; }
+    if (newsize>buflen)
+    {
+        // Keep the old pointer until realloc succeeds: assigning the result
+        // straight to buf would leak the old block on failure.
+        void *grown=realloc(buf,newsize);
+        if (!grown) { free(buf); buf=NULL; buflen=0; return; }
+        buf=grown; buflen=newsize;
+    }
+}
+
+unsigned char *XMLParserBase64Tool::decode(XMLCSTR data, int *outlen, XMLError *xe)
+{
+    // Decodes the NUL-terminated base64 string data into the tool's internal
+    // buffer and returns a pointer to it. The buffer is sized from
+    // decodeSize() plus one spare byte, and alloc() may reallocate it on a
+    // later encode/decode call.
+    // *outlen (if non-NULL) receives the size computed by decodeSize().
+    // Returns NULL when there is nothing to decode, when the buffer cannot be
+    // allocated, or when the decoding pass returns 0 (illegal character or
+    // buffer too small). Truncated input still returns the buffer, with *xe
+    // set by the decoding pass.
+    if (xe) *xe=eXMLErrorNone;
+    unsigned int len=decodeSize(data,xe);
+    if (outlen) *outlen=len;
+    if (!len) return NULL;
+    alloc(len+1);
+    if (!buf) return NULL; // allocation failed: do not decode into a NULL buffer
+    if(!decode(data,(unsigned char*)buf,len,xe)){ return NULL; }
+    return (unsigned char*)buf;
+}
+
diff --git a/ext/mcpat/xmlParser.h b/ext/mcpat/xmlParser.h
new file mode 100644 (file)
index 0000000..e29136c
--- /dev/null
@@ -0,0 +1,764 @@
+/****************************************************************************/
+/*! \mainpage XMLParser library
+ * \section intro_sec Introduction
+ *
+ * This is a basic XML parser written in ANSI C++ for portability.
+ * It works by using recursion and a node tree for breaking
+ * down the elements of an XML document.
+ *
+ * @version     V2.41
+ * @author      Frank Vanden Berghen
+ *
+ * The following license terms for the "XMLParser library from Business-Insight" apply to projects
+ * that are in some way related to
+ * the "mcpat project", including applications
+ * using "mcpat project" and tools developed
+ * for enhancing "mcpat project". All other projects
+ * (not related to "mcpat project") have to use the "XMLParser library from Business-Insight"
+ * code under the Aladdin Free Public License (AFPL)
+ * See the file "AFPL-license.txt" for more informations about the AFPL license.
+ * (see http://www.artifex.com/downloads/doc/Public.htm for detailed AFPL terms)
+ *
+ * Redistribution and use of the "XMLParser library from Business-Insight" in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Frank Vanden Berghen nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2002, Business-Insight
+ * <a href="http://www.Business-Insight.com">Business-Insight</a>
+ * All rights reserved.
+ *
+ * \section tutorial First Tutorial
+ * You can follow a simple <a href="../../xmlParser.html">Tutorial</a> to know the basics...
+ *
+ * \section usage General usage: How to include the XMLParser library inside your project.
+ *
+ * The library is composed of two files: <a href="../../xmlParser.cpp">xmlParser.cpp</a> and
+ * <a href="../../xmlParser.h">xmlParser.h</a>. These are the ONLY 2 files that you need when
+ * using the library inside your own projects.
+ *
+ * All the functions of the library are documented inside the comments of the file
+ * <a href="../../xmlParser.h">xmlParser.h</a>. These comments can be transformed in
+ * full-fledged HTML documentation using the DOXYGEN software: simply type: "doxygen doxy.cfg"
+ *
+ * By default, the XMLParser library uses (char*) for string representation. To use the (wchar_t*)
+ * version of the library, you need to define the "_UNICODE" preprocessor definition variable
+ * (this is usually done inside your project definition file) (This is done automatically for you
+ * when using Visual Studio).
+ *
+ * \section example Advanced Tutorial and Many Examples of usage.
+ *
+ * Some very small introductory examples are described inside the Tutorial file
+ * <a href="../../xmlParser.html">xmlParser.html</a>
+ *
+ * Some additional small examples are also inside the file <a href="../../xmlTest.cpp">xmlTest.cpp</a>
+ * (for the "char*" version of the library) and inside the file
+ * <a href="../../xmlTestUnicode.cpp">xmlTestUnicode.cpp</a> (for the "wchar_t*"
+ * version of the library). If you have a question, please review these additional examples
+ * before sending an e-mail to the author.
+ *
+ * To build the examples:
+ * - linux/unix: type "make"
+ * - solaris: type "make -f makefile.solaris"
+ * - windows: Visual Studio: double-click on xmlParser.dsw
+ *   (under Visual Studio .NET, the .dsp and .dsw files will be automatically converted to .vcproj and .sln files)
+ *
+ * In order to build the examples you need some additional files:
+ * - linux/unix: makefile
+ * - solaris: makefile.solaris
+ * - windows: Visual Studio: *.dsp, xmlParser.dsw and also xmlParser.lib and xmlParser.dll
+ *
+ * \section debugging Debugging with the XMLParser library
+ *
+ * \subsection debugwin Debugging under WINDOWS
+ *
+ *     Inside Visual C++, the "debug versions" of the memory allocation functions are
+ *     very slow: Do not forget to compile in "release mode" to get maximum speed.
+ *     When I had to debug a software that was using the XMLParser Library, it was usually
+ *     a nightmare because the library was sooOOOoooo slow in debug mode (because of the
+ *  slow memory allocations in Debug mode). To solve this
+ *     problem, during all the debugging session, I am now using a very fast DLL version of the
+ *     XMLParser Library (the DLL is compiled in release mode). Using the DLL version of
+ *     the XMLParser Library allows me to have lightning XML parsing speed even in debug!
+ *     Other than that, the DLL version is useless: In the release version of my tool,
+ *     I always use the normal, ".cpp"-based, XMLParser Library (I simply include the
+ * <a href="../../xmlParser.cpp">xmlParser.cpp</a> and
+ * <a href="../../xmlParser.h">xmlParser.h</a> files into the project).
+ *
+ *     The file <a href="../../XMLNodeAutoexp.txt">XMLNodeAutoexp.txt</a> contains some
+ * "tweaks" that improve substantially the display of the content of the XMLNode objects
+ * inside the Visual Studio Debugger. Believe me, once you have seen inside the debugger
+ * the "smooth" display of the XMLNode objects, you cannot live without it anymore!
+ *
+ * \subsection debuglinux Debugging under LINUX/UNIX
+ *
+ *     The speed of the debug version of the XMLParser library is tolerable so no extra
+ * work has been done.
+ *
+ ****************************************************************************/
+
+#ifndef __INCLUDE_XML_NODE__
+#define __INCLUDE_XML_NODE__
+
+#include <stdlib.h>
+
+#ifdef _UNICODE
+// If you comment the next "define" line then the library will never "switch to" _UNICODE (wchar_t*) mode (16/32 bits per characters).
+// This is useful when you get error messages like:
+//    'XMLNode::openFileHelper' : cannot convert parameter 2 from 'const char [5]' to 'const wchar_t *'
+// The _XMLWIDECHAR preprocessor variable forces the XMLParser library into either utf16/32-mode (the preprocessor variable
+// must be defined) or utf8-mode(the pre-processor variable must be undefined).
+#define _XMLWIDECHAR
+#endif
+
+#if defined(WIN32) || defined(UNDER_CE) || defined(_WIN32) || defined(WIN64) || defined(__BORLANDC__)
+// comment the next line if you are under windows and the compiler is not Microsoft Visual Studio (6.0 or .NET) or Borland
+#define _XMLWINDOWS
+#endif
+
+#ifdef XMLDLLENTRY
+#undef XMLDLLENTRY
+#endif
+#ifdef _USE_XMLPARSER_DLL
+#ifdef _DLL_EXPORTS_
+#define XMLDLLENTRY __declspec(dllexport)
+#else
+#define XMLDLLENTRY __declspec(dllimport)
+#endif
+#else
+#define XMLDLLENTRY
+#endif
+
+// uncomment the next line if you want no support for wchar_t* (no need for the <wchar.h> or <tchar.h> libraries anymore to compile)
+//#define XML_NO_WIDE_CHAR
+
+#ifdef XML_NO_WIDE_CHAR
+#undef _XMLWINDOWS
+#undef _XMLWIDECHAR
+#endif
+
+#ifdef _XMLWINDOWS
+#include <tchar.h>
+#else
+#define XMLDLLENTRY
+#ifndef XML_NO_WIDE_CHAR
+#include <wchar.h> // to have 'wcsrtombs' for ANSI version
+                   // to have 'mbsrtowcs' for WIDECHAR version
+#endif
+#endif
+
+// Some common types for char set portable code
+#ifdef _XMLWIDECHAR
+    #define _CXML(c) L ## c
+    #define XMLCSTR const wchar_t *
+    #define XMLSTR  wchar_t *
+    #define XMLCHAR wchar_t
+#else
+    #define _CXML(c) c
+    #define XMLCSTR const char *
+    #define XMLSTR  char *
+    #define XMLCHAR char
+#endif
+#ifndef FALSE
+    #define FALSE 0
+#endif /* FALSE */
+#ifndef TRUE
+    #define TRUE 1
+#endif /* TRUE */
+
+
+/// Enumeration for XML parse errors and for base64 decoding errors.
+typedef enum XMLError
+{
+    eXMLErrorNone = 0, ///< no error
+    eXMLErrorMissingEndTag,
+    eXMLErrorNoXMLTagFound,
+    eXMLErrorEmpty,
+    eXMLErrorMissingTagName,
+    eXMLErrorMissingEndTagName,
+    eXMLErrorUnmatchedEndTag,
+    eXMLErrorUnmatchedEndClearTag,
+    eXMLErrorUnexpectedToken,
+    eXMLErrorNoElements,
+    eXMLErrorFileNotFound,
+    eXMLErrorFirstTagNotFound,
+    eXMLErrorUnknownCharacterEntity,
+    eXMLErrorCharacterCodeAbove255,
+    eXMLErrorCharConversionError,
+    eXMLErrorCannotOpenWriteFile,
+    eXMLErrorCannotWriteFile,
+
+    eXMLErrorBase64DataSizeIsNotMultipleOf4, ///< the number of base64 characters (padding included) is not a multiple of 4
+    eXMLErrorBase64DecodeIllegalCharacter,   ///< a character that is neither base64, '=' nor whitespace was found
+    eXMLErrorBase64DecodeTruncatedData,      ///< the input ended, or padding appeared, where more data was expected
+    eXMLErrorBase64DecodeBufferTooSmall      ///< the output buffer was full before all input was decoded
+} XMLError;
+
+
+/// Enumeration naming the kind of content held by an entry. Use in conjunction with structure XMLNodeContents
+typedef enum XMLElementType
+{
+    eNodeChild=0,     ///< child node
+    eNodeAttribute=1, ///< attribute
+    eNodeText=2,      ///< text field
+    eNodeClear=3,     ///< clear (unformatted) field
+    eNodeNULL=4       ///< presumably marks "no content" - TODO confirm against XMLNode::enumContents
+} XMLElementType;
+
+/// Structure used to obtain error details if the parse fails.
+typedef struct XMLResults
+{
+    enum XMLError error; ///< error code; XMLNode::getError() turns it into a user-friendly message
+    int  nLine,nColumn;  ///< presumably the line and column at which the error was detected - TODO confirm
+} XMLResults;
+
+/// Structure for XML clear (unformatted) node (usually comments)
+typedef struct XMLClear {
+    XMLCSTR lpszValue; XMLCSTR lpszOpenTag; XMLCSTR lpszCloseTag; ///< content and the opening/closing delimiters around it (XMLNode::addClear defaults: "<![CDATA[" and "]]>")
+} XMLClear;
+
+/// Structure for an XML attribute: a name/value pair of strings.
+typedef struct XMLAttribute {
+    XMLCSTR lpszName; XMLCSTR lpszValue; ///< attribute name and attribute value
+} XMLAttribute;
+
+/// XMLElementPosition values are not interchangeable with simple indexes
+typedef int XMLElementPosition; ///< insertion position among a node's children, texts and clear tags: -1 appends at the end, 0 inserts at the beginning
+
+struct XMLNodeContents;
+
+/** @defgroup XMLParserGeneral The XML parser */
+
+/// Main Class representing a XML node
+/**
+ * All operations are performed using this class.
+ * \note The constructors of the XMLNode class are protected, so use instead one of these four methods to get your first instance of XMLNode:
+ * <ul>
+ *    <li> XMLNode::parseString </li>
+ *    <li> XMLNode::parseFile </li>
+ *    <li> XMLNode::openFileHelper </li>
+ *    <li> XMLNode::createXMLTopNode (or XMLNode::createXMLTopNode_WOSD)</li>
+ * </ul> */
+typedef struct XMLDLLENTRY XMLNode
+{
+  private:
+
+    struct XMLNodeDataTag;
+
+    /// Constructors are protected, so use instead one of: XMLNode::parseString, XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode
+    XMLNode(struct XMLNodeDataTag *pParent, XMLSTR lpszName, char isDeclaration);
+    /// Constructors are protected, so use instead one of: XMLNode::parseString, XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode
+    XMLNode(struct XMLNodeDataTag *p);
+
+  public:
+    static XMLCSTR getVersion();///< Return the XMLParser library version number
+
+    /** @defgroup conversions Parsing XML files/strings to an XMLNode structure and Rendering XMLNode's to files/string.
+     * @ingroup XMLParserGeneral
+     * @{ */
+
+    /// Parse an XML string and return the root of a XMLNode tree representing the string.
+    static XMLNode parseString   (XMLCSTR  lpXMLString, XMLCSTR tag=NULL, XMLResults *pResults=NULL);
+    /**< The "parseString" function parse an XML string and return the root of a XMLNode tree. The "opposite" of this function is
+     * the function "createXMLString" that re-creates an XML string from an XMLNode tree. If the XML document is corrupted, the
+     * "parseString" method will initialize the "pResults" variable with some information that can be used to trace the error.
+     * If you still want to parse the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the
+     * beginning of the "xmlParser.cpp" file.
+     *
+     * @param lpXMLString the XML string to parse
+     * @param tag  the name of the first tag inside the XML file. If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>).
+     * @param pResults a pointer to a XMLResults variable that will contain some information that can be used to trace the XML parsing error. You can have a user-friendly explanation of the parsing error with the "getError" function.
+     */
+
+    /// Parse an XML file and return the root of a XMLNode tree representing the file.
+    static XMLNode parseFile     (XMLCSTR     filename, XMLCSTR tag=NULL, XMLResults *pResults=NULL);
+    /**< The "parseFile" function parse an XML file and return the root of a XMLNode tree. The "opposite" of this function is
+     * the function "writeToFile" that re-creates an XML file from an XMLNode tree. If the XML document is corrupted, the
+     * "parseFile" method will initialize the "pResults" variable with some information that can be used to trace the error.
+     * If you still want to parse the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the
+     * beginning of the "xmlParser.cpp" file.
+     *
+     * @param filename the path to the XML file to parse
+     * @param tag the name of the first tag inside the XML file. If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>).
+     * @param pResults a pointer to a XMLResults variable that will contain some information that can be used to trace the XML parsing error. You can have a user-friendly explanation of the parsing error with the "getError" function.
+     */
+
+    /// Parse an XML file and return the root of a XMLNode tree representing the file. A very crude error checking is made. An attempt to guess the Char Encoding used in the file is made.
+    static XMLNode openFileHelper(XMLCSTR     filename, XMLCSTR tag=NULL);
+    /**< The "openFileHelper" function reports to the screen all the warnings and errors that occurred during parsing of the XML file.
+     * This function also tries to guess char Encoding (UTF-8, ASCII or SHIFT-JIS) based on the first 200 bytes of the file. Since each
+     * application has its own way to report and deal with errors, you should rather use the "parseFile" function to parse XML files
+     * and program yourself thereafter an "error reporting" tailored for your needs (instead of using the very crude "error reporting"
+     * mechanism included inside the "openFileHelper" function).
+     *
+     * If the XML document is corrupted, the "openFileHelper" method will:
+     *         - display an error message on the console (or inside a messageBox for windows).
+     *         - stop execution (exit).
+     *
+     * I strongly suggest that you write your own "openFileHelper" method tailored to your needs. If you still want to parse
+     * the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the beginning of the "xmlParser.cpp" file.
+     *
+     * @param filename the path of the XML file to parse.
+     * @param tag the name of the first tag inside the XML file. If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>).
+     */
+
+    static XMLCSTR getError(XMLError error); ///< this gives you a user-friendly explanation of the parsing error
+
+    /// Create an XML string starting from the current XMLNode.
+    XMLSTR createXMLString(int nFormat=1, int *pnSize=NULL) const;
+    /**< The returned string should be free'd using the "freeXMLString" function.
+     *
+     *   If nFormat==0, no formatting is required otherwise this returns an user friendly XML string from a given element
+     *   with appropriate white spaces and carriage returns. if pnSize is given it returns the size in character of the string. */
+
+    /// Save the content of an xmlNode inside a file
+    XMLError writeToFile(XMLCSTR filename,
+                         const char *encoding=NULL,
+                         char nFormat=1) const;
+    /**< If nFormat==0, no formatting is required otherwise this returns an user friendly XML string from a given element with appropriate white spaces and carriage returns.
+     * If the global parameter "characterEncoding==encoding_UTF8", then the "encoding" parameter is ignored and always set to "utf-8".
+     * If the global parameter "characterEncoding==encoding_ShiftJIS", then the "encoding" parameter is ignored and always set to "SHIFT-JIS".
+     * If "_XMLWIDECHAR=1", then the "encoding" parameter is ignored and always set to "utf-16".
+     * If no "encoding" parameter is given the "ISO-8859-1" encoding is used. */
+    /** @} */
+
+    /** @defgroup navigate Navigate the XMLNode structure
+     * @ingroup XMLParserGeneral
+     * @{ */
+    XMLCSTR getName() const;                                       ///< name of the node
+    XMLCSTR getText(int i=0) const;                                ///< return ith text field
+    int nText() const;                                             ///< nbr of text field
+    XMLNode getParentNode() const;                                 ///< return the parent node
+    XMLNode getChildNode(int i=0) const;                           ///< return ith child node
+    XMLNode getChildNode(XMLCSTR name, int i)  const;              ///< return ith child node with specific name (return an empty node if failing). If i==-1, this returns the last XMLNode with the given name.
+    XMLNode getChildNode(XMLCSTR name, int *i=NULL) const;         ///< return next child node with specific name (return an empty node if failing)
+    XMLNode getChildNodeWithAttribute(XMLCSTR tagName,
+                                      XMLCSTR attributeName,
+                                      XMLCSTR attributeValue=NULL,
+                                      int *i=NULL)  const;         ///< return child node with specific name/attribute (return an empty node if failing)
+    XMLNode getChildNodeByPath(XMLCSTR path, char createNodeIfMissing=0, XMLCHAR sep='/');
+                                                                   ///< return the first child node with specific path
+    XMLNode getChildNodeByPathNonConst(XMLSTR  path, char createNodeIfMissing=0, XMLCHAR sep='/');
+                                                                   ///< return the first child node with specific path.
+
+    int nChildNode(XMLCSTR name) const;                            ///< return the number of child node with specific name
+    int nChildNode() const;                                        ///< nbr of child node
+    XMLAttribute getAttribute(int i=0) const;                      ///< return ith attribute
+    XMLCSTR      getAttributeName(int i=0) const;                  ///< return ith attribute name
+    XMLCSTR      getAttributeValue(int i=0) const;                 ///< return ith attribute value
+    char  isAttributeSet(XMLCSTR name) const;                      ///< test if an attribute with a specific name is given
+    XMLCSTR getAttribute(XMLCSTR name, int i) const;               ///< return ith attribute content with specific name (return a NULL if failing)
+    XMLCSTR getAttribute(XMLCSTR name, int *i=NULL) const;         ///< return next attribute content with specific name (return a NULL if failing)
+    int nAttribute() const;                                        ///< nbr of attribute
+    XMLClear getClear(int i=0) const;                              ///< return ith clear field (comments)
+    int nClear() const;                                            ///< nbr of clear field
+    XMLNodeContents enumContents(XMLElementPosition i) const;      ///< enumerate all the different contents (attribute,child,text, clear) of the current XMLNode. The order is reflecting the order of the original file/string. NOTE: 0 <= i < nElement();
+    int nElement() const;                                          ///< nbr of different contents for current node
+    char isEmpty() const;                                          ///< is this node Empty?
+    char isDeclaration() const;                                    ///< is this node a declaration <? .... ?>
+    XMLNode deepCopy() const;                                      ///< deep copy (duplicate/clone) a XMLNode
+    static XMLNode emptyNode();                                    ///< return XMLNode::emptyXMLNode;
+    /** @} */
+
+    ~XMLNode();
+    XMLNode(const XMLNode &A);                                     ///< to allow shallow/fast copy:
+    XMLNode& operator=( const XMLNode& A );                        ///< to allow shallow/fast copy:
+
+    XMLNode(): d(NULL){};
+    static XMLNode emptyXMLNode;
+    static XMLClear emptyXMLClear;
+    static XMLAttribute emptyXMLAttribute;
+
+    /** @defgroup xmlModify Create or Update the XMLNode structure
+     * @ingroup XMLParserGeneral
+     *  The functions in this group allows you to create from scratch (or update) a XMLNode structure. Start by creating your top
+     *  node with the "createXMLTopNode" function and then add new nodes with the "addChild" function. The parameter 'pos' gives
+     *  the position where the childNode, the text or the XMLClearTag will be inserted. The default value (pos=-1) inserts at the
+     *  end. The value (pos=0) insert at the beginning (Insertion at the beginning is slower than at the end). <br>
+     *
+     *  REMARK: 0 <= pos < nChild()+nText()+nClear() <br>
+     */
+
+    /** @defgroup creation Creating from scratch a XMLNode structure
+     * @ingroup xmlModify
+     * @{ */
+    static XMLNode createXMLTopNode(XMLCSTR lpszName, char isDeclaration=FALSE);                    ///< Create the top node of an XMLNode structure
+    XMLNode        addChild(XMLCSTR lpszName, char isDeclaration=FALSE, XMLElementPosition pos=-1); ///< Add a new child node
+    XMLNode        addChild(XMLNode nodeToAdd, XMLElementPosition pos=-1);                          ///< Add an existing node as a child. If "nodeToAdd" already has a parent, it will be detached from its parent before being attached to the current XMLNode
+    XMLAttribute  *addAttribute(XMLCSTR lpszName, XMLCSTR lpszValuev);                              ///< Add a new attribute
+    XMLCSTR        addText(XMLCSTR lpszValue, XMLElementPosition pos=-1);                           ///< Add a new text content
+    XMLClear      *addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen=NULL, XMLCSTR lpszClose=NULL, XMLElementPosition pos=-1);
+    /**< Add a new clear tag
+     * @param lpszOpen default value "<![CDATA["
+     * @param lpszClose default value "]]>"
+     */
+    /** @} */
+
+    /** @defgroup xmlUpdate Updating Nodes
+     * @ingroup xmlModify
+     * Some update functions:
+     * @{
+     */
+    XMLCSTR       updateName(XMLCSTR lpszName);                                                  ///< change node's name
+    XMLAttribute *updateAttribute(XMLAttribute *newAttribute, XMLAttribute *oldAttribute);       ///< if the attribute to update is missing, a new one will be added
+    XMLAttribute *updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName=NULL,int i=0);       ///< if the attribute to update is missing, a new one will be added
+    XMLAttribute *updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,XMLCSTR lpszOldName);///< set lpszNewName=NULL if you don't want to change the name of the attribute if the attribute to update is missing, a new one will be added
+    XMLCSTR       updateText(XMLCSTR lpszNewValue, int i=0);                                     ///< if the text to update is missing, a new one will be added
+    XMLCSTR       updateText(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue);                        ///< if the text to update is missing, a new one will be added
+    XMLClear     *updateClear(XMLCSTR lpszNewContent, int i=0);                                  ///< if the clearTag to update is missing, a new one will be added
+    XMLClear     *updateClear(XMLClear *newP,XMLClear *oldP);                                    ///< if the clearTag to update is missing, a new one will be added
+    XMLClear     *updateClear(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue);                       ///< if the clearTag to update is missing, a new one will be added
+    /** @} */
+
+    /** @defgroup xmlDelete Deleting Nodes or Attributes
+     * @ingroup xmlModify
+     * Some deletion functions:
+     * @{
+     */
+    /// The "deleteNodeContent" function forces the deletion of the content of this XMLNode and the subtree.
+    void deleteNodeContent();
+    /**< \note The XMLNode instances that are referring to the part of the subtree that has been deleted CANNOT be used anymore!!. Unexpected results will occur if you continue using them. */
+    void deleteAttribute(int i=0);                   ///< Delete the ith attribute of the current XMLNode
+    void deleteAttribute(XMLCSTR lpszName);          ///< Delete the attribute with the given name (the "strcmp" function is used to find the right attribute)
+    void deleteAttribute(XMLAttribute *anAttribute); ///< Delete the attribute with the name "anAttribute->lpszName" (the "strcmp" function is used to find the right attribute)
+    void deleteText(int i=0);                        ///< Delete the Ith text content of the current XMLNode
+    void deleteText(XMLCSTR lpszValue);              ///< Delete the text content "lpszValue" inside the current XMLNode (direct "pointer-to-pointer" comparison is used to find the right text)
+    void deleteClear(int i=0);                       ///< Delete the Ith clear tag inside the current XMLNode
+    void deleteClear(XMLCSTR lpszValue);             ///< Delete the clear tag "lpszValue" inside the current XMLNode (direct "pointer-to-pointer" comparison is used to find the clear tag)
+    void deleteClear(XMLClear *p);                   ///< Delete the clear tag "p" inside the current XMLNode (direct "pointer-to-pointer" comparison on the lpszName of the clear tag is used to find the clear tag)
+    /** @} */
+
+    /** @defgroup xmlWOSD ???_WOSD functions.
+     * @ingroup xmlModify
+     *  The strings given as parameters for the "add" and "update" methods that have a name with
+     *  the postfix "_WOSD" (that means "WithOut String Duplication")(for example "addText_WOSD")
+     *  will be free'd by the XMLNode class. For example, it means that this is incorrect:
+     *  \code
+     *     xNode.addText_WOSD("foo");
+     *     xNode.updateAttribute_WOSD("#newcolor" ,NULL,"color");
+     *  \endcode
+     *  In opposition, this is correct:
+     *  \code
+     *     xNode.addText("foo");
+     *     xNode.addText_WOSD(stringDup("foo"));
+     *     xNode.updateAttribute("#newcolor" ,NULL,"color");
+     *     xNode.updateAttribute_WOSD(stringDup("#newcolor"),NULL,"color");
+     *  \endcode
+     *  Typically, you will never do:
+     *  \code
+     *     char *b=(char*)malloc(...);
+     *     xNode.addText(b);
+     *     free(b);
+     *  \endcode
+     *  ... but rather:
+     *  \code
+     *     char *b=(char*)malloc(...);
+     *     xNode.addText_WOSD(b);
+     *  \endcode
+     *  ('free(b)' is performed by the XMLNode class)
+     * @{ */
+    static XMLNode createXMLTopNode_WOSD(XMLSTR lpszName, char isDeclaration=FALSE);                     ///< Create the top node of an XMLNode structure
+    XMLNode        addChild_WOSD(XMLSTR lpszName, char isDeclaration=FALSE, XMLElementPosition pos=-1);  ///< Add a new child node
+    XMLAttribute  *addAttribute_WOSD(XMLSTR lpszName, XMLSTR lpszValue);                                 ///< Add a new attribute
+    XMLCSTR        addText_WOSD(XMLSTR lpszValue, XMLElementPosition pos=-1);                            ///< Add a new text content
+    XMLClear      *addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen=NULL, XMLCSTR lpszClose=NULL, XMLElementPosition pos=-1); ///< Add a new clear Tag
+
+    XMLCSTR        updateName_WOSD(XMLSTR lpszName);                                                  ///< change node's name
+    XMLAttribute  *updateAttribute_WOSD(XMLAttribute *newAttribute, XMLAttribute *oldAttribute);      ///< if the attribute to update is missing, a new one will be added
+    XMLAttribute  *updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName=NULL,int i=0);        ///< if the attribute to update is missing, a new one will be added
+    XMLAttribute  *updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,XMLCSTR lpszOldName); ///< set lpszNewName=NULL if you don't want to change the name of the attribute if the attribute to update is missing, a new one will be added
+    XMLCSTR        updateText_WOSD(XMLSTR lpszNewValue, int i=0);                                     ///< if the text to update is missing, a new one will be added
+    XMLCSTR        updateText_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue);                        ///< if the text to update is missing, a new one will be added
+    XMLClear      *updateClear_WOSD(XMLSTR lpszNewContent, int i=0);                                  ///< if the clearTag to update is missing, a new one will be added
+    XMLClear      *updateClear_WOSD(XMLClear *newP,XMLClear *oldP);                                   ///< if the clearTag to update is missing, a new one will be added
+    XMLClear      *updateClear_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue);                       ///< if the clearTag to update is missing, a new one will be added
+    /** @} */
+
+    /** @defgroup xmlPosition Position helper functions (use in conjunction with the update&add functions)
+     * @ingroup xmlModify
+     * These are some useful functions when you want to insert a childNode, a text or a XMLClearTag in the
+     * middle (at a specified position) of a XMLNode tree already constructed. The value returned by these
+     * methods is to be used as last parameter (parameter 'pos') of addChild, addText or addClear.
+     * @{ */
+    XMLElementPosition positionOfText(int i=0) const;
+    XMLElementPosition positionOfText(XMLCSTR lpszValue) const;
+    XMLElementPosition positionOfClear(int i=0) const;
+    XMLElementPosition positionOfClear(XMLCSTR lpszValue) const;
+    XMLElementPosition positionOfClear(XMLClear *a) const;
+    XMLElementPosition positionOfChildNode(int i=0) const;
+    XMLElementPosition positionOfChildNode(XMLNode x) const;
+    XMLElementPosition positionOfChildNode(XMLCSTR name, int i=0) const; ///< return the position of the ith childNode with the specified name if (name==NULL) return the position of the ith childNode
+    /** @} */
+
+    /// Enumeration for XML character encoding.
+    typedef enum XMLCharEncoding
+    {
+        char_encoding_error=0,  // value 0: what guessCharEncoding returns when an inconsistency in the encoding is detected
+        char_encoding_UTF8=1,   // default value of the "characterEncoding" parameter of setGlobalOptions
+        char_encoding_legacy=2,
+        char_encoding_ShiftJIS=3,
+        char_encoding_GB2312=4,
+        char_encoding_Big5=5,
+        char_encoding_GBK=6     // this is actually the same as Big5
+    } XMLCharEncoding;
+
+    /** \addtogroup conversions
+     * @{ */
+
+    /// Sets the global options for the conversions
+    static char setGlobalOptions(XMLCharEncoding characterEncoding=XMLNode::char_encoding_UTF8, char guessWideCharChars=1,
+                                 char dropWhiteSpace=1, char removeCommentsInMiddleOfText=1);
+    /**< The "setGlobalOptions" function allows you to change four global parameters that affect string & file
+     * parsing. First of all, you most-probably will never have to change these four global parameters.
+     *
+     * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is compiled in WideChar mode, then the
+     *     XMLNode::parseFile and XMLNode::openFileHelper functions will test if the file contains ASCII
+     *     characters. If this is the case, then the file will be loaded and converted in memory to
+     *     WideChar before being parsed. If 0, no conversion will be performed.
+     *
+     * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is compiled in ASCII/UTF8/char* mode, then the
+     *     XMLNode::parseFile and XMLNode::openFileHelper functions will test if the file contains WideChar
+     *     characters. If this is the case, then the file will be loaded and converted in memory to
+     *     ASCII/UTF8/char* before being parsed. If 0, no conversion will be performed.
+     *
+     * @param characterEncoding This parameter is only meaningful when compiling in char* mode (multibyte character mode).
+     *     In wchar_t* (wide char mode), this parameter is ignored. This parameter should be one of the
+     *     encodings of the XMLNode::XMLCharEncoding enumeration, for example XMLNode::char_encoding_UTF8,
+     *     XMLNode::char_encoding_legacy or XMLNode::char_encoding_ShiftJIS.
+     *
+     * @param dropWhiteSpace In most situations, text fields containing only white spaces (and carriage returns)
+     *     are useless. Even more, these "empty" text fields are annoying because they increase the
+     *     complexity of the user's code for parsing. So, 99% of the time, it's better to drop
+     *     the "empty" text fields. However The XML specification indicates that no white spaces
+     *     should be lost when parsing the file. So to be perfectly XML-compliant, you should set
+     *     dropWhiteSpace=0. A note of caution: if you set "dropWhiteSpace=0", the parser will be
+     *     slower and your code will be more complex.
+     *
+     * @param removeCommentsInMiddleOfText To explain this parameter, let's consider this code:
+     * \code
+     *        XMLNode x=XMLNode::parseString("<a>foo<!-- hello -->bar<!DOCTYPE world >chu</a>","a");
+     * \endcode
+     *     If removeCommentsInMiddleOfText=0, then we will have:
+     * \code
+     *        x.getText(0) -> "foo"
+     *        x.getText(1) -> "bar"
+     *        x.getText(2) -> "chu"
+     *        x.getClear(0) --> "<!-- hello -->"
+     *        x.getClear(1) --> "<!DOCTYPE world >"
+     * \endcode
+     *     If removeCommentsInMiddleOfText=1, then we will have:
+     * \code
+     *        x.getText(0) -> "foobar"
+     *        x.getText(1) -> "chu"
+     *        x.getClear(0) --> "<!DOCTYPE world >"
+     * \endcode
+     *
+     * \return "0" when there are no errors. If you try to set an unrecognized encoding then the return value will be "1" to signal an error.
+     *
+     * \note Sometimes, it's useful to set "guessWideCharChars=0" to disable any conversion
+     * because the test to detect the file-type (ASCII/UTF8/char* or WideChar) may fail (rarely). */
+
+    /// Guess the character encoding of the string (ascii, utf8 or shift-JIS)
+    static XMLCharEncoding guessCharEncoding(void *buffer, int bufLen, char useXMLEncodingAttribute=1);
+    /**< The "guessCharEncoding" function tries to guess the character encoding. You most-probably will never
+     * have to use this function. It returns the appropriate value of the global parameter
+     * "characterEncoding" described in XMLNode::setGlobalOptions. The guess is based on the content of a buffer of length
+     * "bufLen" bytes that contains the first bytes (minimum 25 bytes; 200 bytes is a good value) of the
+     * file to be parsed. The XMLNode::openFileHelper function uses this function to automatically compute
+     * the value of the "characterEncoding" global parameter. There are several heuristics used to do the
+     * guess. One of the heuristics is based on the "encoding" attribute. The original XML specification
+     * forbids using this attribute to do the guess but you can still use it if you set
+     * "useXMLEncodingAttribute" to 1 (this is the default behavior and the behavior of most parsers).
+     * If an inconsistency in the encoding is detected, then the return value is "0". */
+    /** @} */
+
+  private:
+      // these are functions and structures used internally by the XMLNode class (don't bother about them):
+
+      typedef struct XMLNodeDataTag // node payload kept behind a pointer to allow shallow copy and "intelligent/smart" pointers (automatic delete):
+      {
+          XMLCSTR                lpszName;        // Element name (=NULL if root)
+          int                    nChild,          // Number of child nodes
+                                 nText,           // Number of text fields
+                                 nClear,          // Number of Clear fields (comments)
+                                 nAttribute;      // Number of attributes
+          char                   isDeclaration;   // Whether node is an XML declaration - '<?xml ?>'
+          struct XMLNodeDataTag  *pParent;        // Pointer to parent element (=NULL if root)
+          XMLNode                *pChild;         // Array of child nodes (see nChild)
+          XMLCSTR                *pText;          // Array of text fields (see nText)
+          XMLClear               *pClear;         // Array of clear fields (see nClear)
+          XMLAttribute           *pAttribute;     // Array of attributes (see nAttribute)
+          int                    *pOrder;         // Relative order of the child nodes, text fields and clear fields
+          int                    ref_count;       // Reference count, for garbage collection (smart pointers)
+      } XMLNodeData;
+      XMLNodeData *d; // the node's data; NULL for a default-constructed XMLNode
+
+      char parseClearTag(void *px, void *pa);
+      char maybeAddTxT(void *pa, XMLCSTR tokenPStr);
+      int ParseXMLElement(void *pXML);
+      void *addToOrder(int memInc, int *_pos, int nc, void *p, int size, XMLElementType xtype);
+      int indexText(XMLCSTR lpszValue) const;
+      int indexClear(XMLCSTR lpszValue) const;
+      XMLNode addChild_priv(int,XMLSTR,char,int);
+      XMLAttribute *addAttribute_priv(int,XMLSTR,XMLSTR);
+      XMLCSTR addText_priv(int,XMLSTR,int);
+      XMLClear *addClear_priv(int,XMLSTR,XMLCSTR,XMLCSTR,int);
+      void emptyTheNode(char force);
+      static inline XMLElementPosition findPosition(XMLNodeData *d, int index, XMLElementType xtype);
+      static int CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, int nFormat);
+      static int removeOrderElement(XMLNodeData *d, XMLElementType t, int index);
+      static void exactMemory(XMLNodeData *d);
+      static int detachFromParent(XMLNodeData *d);
+} XMLNode;
+
+/// One content item of a node; this structure is given by the function XMLNode::enumContents.
+typedef struct XMLNodeContents
+{
+    /// Type of this content item: it dictates which of the members below carries the content.
+    enum XMLElementType etype;
+    /**< The members below should be a union selected through "etype", but the compiler does not allow a union of objects with constructors, so they are separate fields. */
+    XMLNode child;       ///< the content when the item is a child node
+    XMLAttribute attrib; ///< the content when the item is an attribute
+    XMLCSTR text;        ///< the content when the item is a text field
+    XMLClear clear;      ///< the content when the item is a clear field
+
+} XMLNodeContents;
+
+/** @defgroup StringAlloc String Allocation/Free functions
+ * @ingroup xmlModify
+ * @{ */
+/// Duplicate (copy into a newly allocated buffer) the source string.
+XMLDLLENTRY XMLSTR stringDup(XMLCSTR source, int cbData=-1);
+/**< This is
+ * a very handy function when used with all the "XMLNode::*_WOSD" functions (\link xmlWOSD \endlink).
+ * @param cbData If !=0 then cbData is the number of chars to duplicate (NOTE(review): the declared default is -1, which presumably means "duplicate the whole string" - confirm against the implementation). New strings allocated with
+ * this function should be free'd using the "freeXMLString" function. */
+
+/// Free a string allocated inside the "stringDup" function or the "createXMLString" function.
+XMLDLLENTRY void freeXMLString(XMLSTR t); // {free(t);}
+/** @} */
+
+/** @defgroup atoX ato? like functions
+ * @ingroup XMLParserGeneral
+ * The "xmlto?" functions are equivalent to the atoi, atol, atof functions.
+ * The only difference is: if the variable "xmlString" is NULL, then the return value
+ * is "defautValue". These 6 functions are only here as "convenience" functions for the
+ * user (they are not used inside the XMLparser). If you don't need them, you can
+ * delete them without any trouble.
+ *
+ * @{ */
+XMLDLLENTRY char    xmltob(XMLCSTR xmlString,char   defautValue=0);
+XMLDLLENTRY int     xmltoi(XMLCSTR xmlString,int    defautValue=0);
+XMLDLLENTRY long    xmltol(XMLCSTR xmlString,long   defautValue=0);
+XMLDLLENTRY double  xmltof(XMLCSTR xmlString,double defautValue=.0);
+XMLDLLENTRY XMLCSTR xmltoa(XMLCSTR xmlString,XMLCSTR defautValue=_CXML(""));
+XMLDLLENTRY XMLCHAR xmltoc(XMLCSTR xmlString,XMLCHAR defautValue=_CXML('\0'));
+/** @} */
+
+/** @defgroup ToXMLStringTool Helper class to create XML files using "printf", "fprintf", "cout",... functions.
+ * @ingroup XMLParserGeneral
+ * @{ */
+/// Helper class to create XML files using "printf", "fprintf", "cout",... functions.
+/** The ToXMLStringTool class helps you create XML files using "printf", "fprintf", "cout",... functions.
+ * The "ToXMLStringTool" class processes strings so that all the characters
+ * &,",',<,> are replaced by their XML equivalent:
+ * \verbatim &amp;, &quot;, &apos;, &lt;, &gt; \endverbatim
+ * Using the "ToXMLStringTool class" and the "fprintf function" is THE most efficient
+ * way to produce VERY large XML documents VERY fast.
+ * \note If you are creating an XML file from scratch using the provided XMLNode class
+ * you must not use the "ToXMLStringTool" class (because the "XMLNode" class does the
+ * processing job for you during rendering).*/
+typedef struct XMLDLLENTRY ToXMLStringTool
+{
+public:
+    ToXMLStringTool(): buf(NULL),buflen(0){} ///< starts without any internal buffer (buf==NULL, buflen==0)
+    ~ToXMLStringTool();
+    void freeBuffer();///< Call this function when you have finished using this object to release the memory used by the internal buffer.
+
+    XMLSTR toXML(XMLCSTR source);///< Returns a pointer to an internal buffer that contains an XML-encoded string based on the "source" parameter.
+
+    /** The "toXMLUnSafe" function is deprecated because there is a possibility of
+     * "destination-buffer-overflow". It converts the string
+     * "source" to the string "dest". */
+    static XMLSTR toXMLUnSafe(XMLSTR dest,XMLCSTR source); ///< deprecated: use "toXML" instead
+    static int lengthXMLString(XMLCSTR source);            ///< deprecated: use "toXML" instead
+
+private:
+    XMLSTR buf;  ///< internal buffer (the one "toXML" returns a pointer to and "freeBuffer" releases)
+    int buflen;  ///< presumably the allocated size of "buf" - TODO confirm against the implementation
+} ToXMLStringTool;
+/** @} */
+
+/** @defgroup XMLParserBase64Tool Helper class to include binary data inside XML strings using "Base64 encoding".
+ * @ingroup XMLParserGeneral
+ * @{ */
+/// Helper class to include binary data inside XML strings using "Base64 encoding".
+/** The "XMLParserBase64Tool" class allows you to include any binary data (images, sounds,...)
+ * into an XML document using "Base64 encoding". This class is completely
+ * separated from the rest of the xmlParser library and can be removed without any problem.
+ * To include some binary data into an XML file, you must convert the binary data into
+ * standard text (using "encode"). To retrieve the original binary data from the
+ * b64-encoded text included inside the XML file, use "decode". Alternatively, these
+ * functions can also be used to "encrypt/decrypt" some critical data contained inside
+ * the XML (it's not a strong encryption at all, but sometimes it can be useful). */
+typedef struct XMLDLLENTRY XMLParserBase64Tool
+{
+public:
+    XMLParserBase64Tool(): buf(NULL),buflen(0){} ///< starts without any internal buffer (buf==NULL, buflen==0)
+    ~XMLParserBase64Tool();
+    void freeBuffer();///< Call this function when you have finished using this object to release memory used by the internal buffer.
+
+    /**
+     * @param formatted If "formatted" is true, some space will be reserved for a carriage-return every 72 chars. */
+    static int encodeLength(int inBufLen, char formatted=0); ///< return the length of the base64 string that encodes a data buffer of size inBufLen bytes.
+
+    /**
+     * The "encode" function returns a string containing the base64 encoding of "inByteLen" bytes
+     * from "inByteBuf". If the "formatted" parameter is true, then there will be a carriage-return every 72 chars.
+     * The string will be free'd when the XMLParserBase64Tool object is deleted.
+     * All returned strings share the same memory space. */
+    XMLSTR encode(unsigned char *inByteBuf, unsigned int inByteLen, char formatted=0); ///< returns a pointer to an internal buffer containing the base64 string containing the binary data encoded from "inByteBuf"
+
+    /// returns the number of bytes which will be decoded from "inString".
+    static unsigned int decodeSize(XMLCSTR inString, XMLError *xe=NULL);
+
+    /**
+     * The "decode" function returns a pointer to a buffer containing the binary data decoded from "inString".
+     * The output buffer will be free'd when the XMLParserBase64Tool object is deleted.
+     * All output buffers share the same memory space.
+     * @param inString If "inString" is malformed, NULL will be returned */
+    unsigned char* decode(XMLCSTR inString, int *outByteLen=NULL, XMLError *xe=NULL); ///< returns a pointer to an internal buffer containing the binary data decoded from "inString"
+
+    /**
+     * Decodes data from "inString" to "outByteBuf". You need to provide the size (in bytes) of "outByteBuf"
+     * in "inMaxByteOutBuflen". If "outByteBuf" is not large enough or if the data is malformed, then "FALSE"
+     * will be returned; otherwise "TRUE". */
+    static unsigned char decode(XMLCSTR inString, unsigned char *outByteBuf, int inMaxByteOutBuflen, XMLError *xe=NULL); ///< deprecated.
+
+private:
+    void *buf;               ///< the internal buffer that the non-static "encode" and "decode" return pointers into
+    int buflen;              ///< presumably the allocated size of "buf" - TODO confirm against the implementation
+    void alloc(int newsize); ///< presumably (re)sizes "buf" to "newsize" - TODO confirm against the implementation
+}XMLParserBase64Tool;
+/** @} */
+
+#undef XMLDLLENTRY
+
+#endif