From: Tony Gutierrez Date: Tue, 19 Jan 2016 19:28:22 +0000 (-0500) Subject: gpu-compute: AMD's baseline GPU model X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=1a7d3f9fcb76a68540dd948f91413533a383bfde;p=gem5.git gpu-compute: AMD's baseline GPU model --- diff --git a/SConstruct b/SConstruct index eadf5d9d2..c291265fc 100755 --- a/SConstruct +++ b/SConstruct @@ -1065,7 +1065,9 @@ main = conf.Finish() # Define the universe of supported ISAs all_isa_list = [ ] +all_gpu_isa_list = [ ] Export('all_isa_list') +Export('all_gpu_isa_list') class CpuModel(object): '''The CpuModel class encapsulates everything the ISA parser needs to @@ -1121,9 +1123,11 @@ for bdir in [ base_dir ] + extras_dir_list: SConscript(joinpath(root, 'SConsopts')) all_isa_list.sort() +all_gpu_isa_list.sort() sticky_vars.AddVariables( EnumVariable('TARGET_ISA', 'Target ISA', 'alpha', all_isa_list), + EnumVariable('TARGET_GPU_ISA', 'Target GPU ISA', 'hsail', all_gpu_isa_list), ListVariable('CPU_MODELS', 'CPU models', sorted(n for n,m in CpuModel.dict.iteritems() if m.default), sorted(CpuModel.dict.keys())), @@ -1139,6 +1143,7 @@ sticky_vars.AddVariables( BoolVariable('USE_FENV', 'Use IEEE mode control', have_fenv), BoolVariable('CP_ANNOTATE', 'Enable critical path annotation capability', False), BoolVariable('USE_KVM', 'Enable hardware virtualized (KVM) CPU models', have_kvm), + BoolVariable('BUILD_GPU', 'Build the compute-GPU model', False), EnumVariable('PROTOCOL', 'Coherence protocol for Ruby', 'None', all_protocols), EnumVariable('BACKTRACE_IMPL', 'Post-mortem dump implementation', @@ -1146,9 +1151,9 @@ sticky_vars.AddVariables( ) # These variables get exported to #defines in config/*.hh (see src/SConscript). 
-export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'CP_ANNOTATE', - 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', 'HAVE_PROTOBUF', - 'HAVE_PERF_ATTR_EXCLUDE_HOST'] +export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'TARGET_GPU_ISA', + 'CP_ANNOTATE', 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', + 'HAVE_PROTOBUF', 'HAVE_PERF_ATTR_EXCLUDE_HOST'] ################################################### # @@ -1226,6 +1231,7 @@ main.SConscript('ext/nomali/SConscript', ################################################### main['ALL_ISA_LIST'] = all_isa_list +main['ALL_GPU_ISA_LIST'] = all_gpu_isa_list all_isa_deps = {} def make_switching_dir(dname, switch_headers, env): # Generate the header. target[0] is the full path of the output @@ -1258,6 +1264,35 @@ def make_switching_dir(dname, switch_headers, env): Export('make_switching_dir') +def make_gpu_switching_dir(dname, switch_headers, env): + # Generate the header. target[0] is the full path of the output + # header to generate. 'source' is a dummy variable, since we get the + # list of ISAs from env['ALL_ISA_LIST']. + def gen_switch_hdr(target, source, env): + fname = str(target[0]) + + isa = env['TARGET_GPU_ISA'].lower() + + try: + f = open(fname, 'w') + print >>f, '#include "%s/%s/%s"' % (dname, isa, basename(fname)) + f.close() + except IOError: + print "Failed to create %s" % fname + raise + + # Build SCons Action object. 'varlist' specifies env vars that this + # action depends on; when env['ALL_ISA_LIST'] changes these actions + # should get re-executed. 
+ switch_hdr_action = MakeAction(gen_switch_hdr, + Transform("GENERATE"), varlist=['ALL_ISA_GPU_LIST']) + + # Instantiate actions for each header + for hdr in switch_headers: + env.Command(hdr, [], switch_hdr_action) + +Export('make_gpu_switching_dir') + # all-isas -> all-deps -> all-environs -> all_targets main.Alias('#all-isas', []) main.Alias('#all-deps', '#all-isas') diff --git a/build_opts/HSAIL_X86 b/build_opts/HSAIL_X86 new file mode 100644 index 000000000..105f82cbd --- /dev/null +++ b/build_opts/HSAIL_X86 @@ -0,0 +1,5 @@ +PROTOCOL = 'GPU_RfO' +TARGET_ISA = 'x86' +TARGET_GPU_ISA = 'hsail' +BUILD_GPU = True +CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU' diff --git a/build_opts/X86_MOESI_AMD_Base b/build_opts/X86_MOESI_AMD_Base new file mode 100644 index 000000000..e85f36d82 --- /dev/null +++ b/build_opts/X86_MOESI_AMD_Base @@ -0,0 +1,3 @@ +PROTOCOL = 'MOESI_AMD_Base' +TARGET_ISA = 'x86' +CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU' \ No newline at end of file diff --git a/configs/common/GPUTLBConfig.py b/configs/common/GPUTLBConfig.py new file mode 100644 index 000000000..b7ea6dcf1 --- /dev/null +++ b/configs/common/GPUTLBConfig.py @@ -0,0 +1,203 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Lisa Hsu +# + +# Configure the TLB hierarchy +# Places which would probably need to be modified if you +# want a different hierarchy are specified by a '< Modify here ... >' +# comment +import m5 +from m5.objects import * + +def TLB_constructor(level): + + constructor_call = "X86GPUTLB(size = options.L%(level)dTLBentries, \ + assoc = options.L%(level)dTLBassoc, \ + hitLatency = options.L%(level)dAccessLatency,\ + missLatency2 = options.L%(level)dMissLatency,\ + maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\ + accessDistance = options.L%(level)dAccessDistanceStat,\ + clk_domain = SrcClockDomain(\ + clock = options.GPUClock,\ + voltage_domain = VoltageDomain(\ + voltage = options.gpu_voltage)))" % locals() + return constructor_call + +def Coalescer_constructor(level): + + constructor_call = "TLBCoalescer(probesPerCycle = \ + options.L%(level)dProbesPerCycle, \ + coalescingWindow = options.L%(level)dCoalescingWindow,\ + disableCoalescing = options.L%(level)dDisableCoalescing,\ + clk_domain = 
SrcClockDomain(\ + clock = options.GPUClock,\ + voltage_domain = VoltageDomain(\ + voltage = options.gpu_voltage)))" % locals() + return constructor_call + +def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name): + # arguments: options, TLB level, number of private structures for this Level, + # TLB name and Coalescer name + for i in xrange(my_index): + TLB_name.append(eval(TLB_constructor(my_level))) + Coalescer_name.append(eval(Coalescer_constructor(my_level))) + +def config_tlb_hierarchy(options, system, shader_idx): + n_cu = options.num_compute_units + # Make this configurable now, instead of the hard coded val. The dispatcher + # is always the last item in the system.cpu list. + dispatcher_idx = len(system.cpu) - 1 + + if options.TLB_config == "perLane": + num_TLBs = 64 * n_cu + elif options.TLB_config == "mono": + num_TLBs = 1 + elif options.TLB_config == "perCU": + num_TLBs = n_cu + elif options.TLB_config == "2CU": + num_TLBs = n_cu >> 1 + else: + print "Bad option for TLB Configuration." 
+ sys.exit(1) + + #---------------------------------------------------------------------------------------- + # A visual representation of the TLB hierarchy + # for ease of configuration + # < Modify here the width and the number of levels if you want a different configuration > + # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level + L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []}, + {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}, + {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}] + + L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] + L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] + + TLB_hierarchy = [L1, L2, L3] + + #---------------------------------------------------------------------------------------- + # Create the hierarchy + # Call the appropriate constructors and add objects to the system + + for i in xrange(len(TLB_hierarchy)): + hierarchy_level = TLB_hierarchy[i] + level = i+1 + for TLB_type in hierarchy_level: + TLB_index = TLB_type['width'] + TLB_array = TLB_type['TLBarray'] + Coalescer_array = TLB_type['CoalescerArray'] + # If the sim calls for a fixed L1 TLB size across CUs, + # override the TLB entries option + if options.tot_L1TLB_size: + options.L1TLBentries = options.tot_L1TLB_size / num_TLBs + if options.L1TLBassoc > options.L1TLBentries: + options.L1TLBassoc = options.L1TLBentries + # call the constructors for the TLB and the Coalescer + create_TLB_Coalescer(options, level, TLB_index,\ + TLB_array, Coalescer_array) + + system_TLB_name = TLB_type['name'] + '_tlb' + system_Coalescer_name = TLB_type['name'] + '_coalescer' + + # add the different TLB levels to the system + # Modify here if you want to make the TLB hierarchy a child of + # the shader. 
+ exec('system.%s = TLB_array' % system_TLB_name) + exec('system.%s = Coalescer_array' % system_Coalescer_name) + + #=========================================================== + # Specify the TLB hierarchy (i.e., port connections) + # All TLBs but the last level TLB need to have a memSidePort (master) + #=========================================================== + + # Each TLB is connected with its Coalescer through a single port. + # There is a one-to-one mapping of TLBs to Coalescers at a given level + # This won't be modified no matter what the hierarchy looks like. + for i in xrange(len(TLB_hierarchy)): + hierarchy_level = TLB_hierarchy[i] + level = i+1 + for TLB_type in hierarchy_level: + name = TLB_type['name'] + for index in range(TLB_type['width']): + exec('system.%s_coalescer[%d].master[0] = \ + system.%s_tlb[%d].slave[0]' % \ + (name, index, name, index)) + + # Connect the cpuSidePort (slave) of all the coalescers in level 1 + # < Modify here if you want a different configuration > + for TLB_type in L1: + name = TLB_type['name'] + num_TLBs = TLB_type['width'] + if name == 'l1': # L1 D-TLBs + tlb_per_cu = num_TLBs / n_cu + for cu_idx in range(n_cu): + if tlb_per_cu: + for tlb in range(tlb_per_cu): + exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ + system.l1_coalescer[%d].slave[%d]' % \ + (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0)) + else: + exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ + system.l1_coalescer[%d].slave[%d]' % \ + (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs))) + + elif name == 'dispatcher': # Dispatcher TLB + for index in range(TLB_type['width']): + exec('system.cpu[%d].translation_port = \ + system.dispatcher_coalescer[%d].slave[0]' % \ + (dispatcher_idx, index)) + elif name == 'sqc': # I-TLB + for index in range(n_cu): + sqc_tlb_index = index / options.cu_per_sqc + sqc_tlb_port_id = index % options.cu_per_sqc + exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \ + 
system.sqc_coalescer[%d].slave[%d]' % \ + (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id)) + + + # Connect the memSidePorts (masters) of all the TLBs with the + # cpuSidePorts (slaves) of the Coalescers of the next level + # < Modify here if you want a different configuration > + # L1 <-> L2 + l2_coalescer_index = 0 + for TLB_type in L1: + name = TLB_type['name'] + for index in range(TLB_type['width']): + exec('system.%s_tlb[%d].master[0] = \ + system.l2_coalescer[0].slave[%d]' % \ + (name, index, l2_coalescer_index)) + l2_coalescer_index += 1 + # L2 <-> L3 + system.l2_tlb[0].master[0] = system.l3_coalescer[0].slave[0] + + return system diff --git a/configs/common/GPUTLBOptions.py b/configs/common/GPUTLBOptions.py new file mode 100644 index 000000000..40a46d560 --- /dev/null +++ b/configs/common/GPUTLBOptions.py @@ -0,0 +1,109 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Myrto Papadopoulou +# + +def tlb_options(parser): + + #=================================================================== + # TLB Configuration + #=================================================================== + + parser.add_option("--TLB-config", type="string", default="perCU", + help="Options are: perCU (default), mono, 2CU, or perLane") + + #=================================================================== + # L1 TLB Options (D-TLB, I-TLB, Dispatcher-TLB) + #=================================================================== + + parser.add_option("--L1TLBentries", type='int', default="32") + parser.add_option("--L1TLBassoc", type='int', default="32") + parser.add_option("--L1AccessLatency", type='int', default="1", + help="latency in gpu cycles") + parser.add_option("--L1MissLatency", type='int', default="750", + help="latency (in gpu cycles) of a page walk, " + "if this is a last level TLB") + parser.add_option("--L1MaxOutstandingReqs", type='int', default="64") + parser.add_option("--L1AccessDistanceStat", action="store_true") + parser.add_option("--tot-L1TLB-size", type="int", default="0") + + #=================================================================== + # L2 TLB Options + #=================================================================== + + parser.add_option("--L2TLBentries", type='int', default="4096") + parser.add_option("--L2TLBassoc", type='int', default="32") + 
parser.add_option("--L2AccessLatency", type='int', default="69", + help="latency in gpu cycles") + parser.add_option("--L2MissLatency", type='int', default="750", + help="latency (in gpu cycles) of a page walk, " + "if this is a last level TLB") + parser.add_option("--L2MaxOutstandingReqs", type='int', default="64") + parser.add_option("--L2AccessDistanceStat", action="store_true") + + #=================================================================== + # L3 TLB Options + #=================================================================== + + parser.add_option("--L3TLBentries", type='int', default="8192") + parser.add_option("--L3TLBassoc", type='int', default="32") + parser.add_option("--L3AccessLatency", type='int', default="150", + help="latency in gpu cycles") + parser.add_option("--L3MissLatency", type='int', default="750", + help="latency (in gpu cycles) of a page walk") + parser.add_option("--L3MaxOutstandingReqs", type='int', default="64") + parser.add_option("--L3AccessDistanceStat", action="store_true") + + #=================================================================== + # L1 TLBCoalescer Options + #=================================================================== + + parser.add_option("--L1ProbesPerCycle", type='int', default="2") + parser.add_option("--L1CoalescingWindow", type='int', default="1") + parser.add_option("--L1DisableCoalescing", action="store_true") + + #=================================================================== + # L2 TLBCoalescer Options + #=================================================================== + + parser.add_option("--L2ProbesPerCycle", type='int', default="2") + parser.add_option("--L2CoalescingWindow", type='int', default="1") + parser.add_option("--L2DisableCoalescing", action="store_true") + + #=================================================================== + # L3 TLBCoalescer Options + #=================================================================== + + parser.add_option("--L3ProbesPerCycle", 
type='int', default="2") + parser.add_option("--L3CoalescingWindow", type='int', default="1") + parser.add_option("--L3DisableCoalescing", action="store_true") diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py new file mode 100644 index 000000000..75819b505 --- /dev/null +++ b/configs/example/apu_se.py @@ -0,0 +1,499 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Sooraj Puthoor +# + +import optparse, os, re +import math +import glob +import inspect + +import m5 +from m5.objects import * +from m5.util import addToPath + +addToPath('../ruby') +addToPath('../common') +addToPath('../topologies') + +import Options +import Ruby +import Simulation +import GPUTLBOptions, GPUTLBConfig + +########################## Script Options ######################## +def setOption(parser, opt_str, value = 1): + # check to make sure the option actually exists + if not parser.has_option(opt_str): + raise Exception("cannot find %s in list of possible options" % opt_str) + + opt = parser.get_option(opt_str) + # set the value + exec("parser.values.%s = %s" % (opt.dest, value)) + +def getOption(parser, opt_str): + # check to make sure the option actually exists + if not parser.has_option(opt_str): + raise Exception("cannot find %s in list of possible options" % opt_str) + + opt = parser.get_option(opt_str) + # get the value + exec("return_value = parser.values.%s" % opt.dest) + return return_value + +# Adding script options +parser = optparse.OptionParser() +Options.addCommonOptions(parser) +Options.addSEOptions(parser) + +parser.add_option("--cpu-only-mode", action="store_true", default=False, + help="APU mode. 
Used to take care of problems in "\ + "Ruby.py while running APU protocols") +parser.add_option("-k", "--kernel-files", + help="file(s) containing GPU kernel code (colon separated)") +parser.add_option("-u", "--num-compute-units", type="int", default=1, + help="number of GPU compute units"), +parser.add_option("--num-cp", type="int", default=0, + help="Number of GPU Command Processors (CP)") +parser.add_option("--benchmark-root", help="Root of benchmark directory tree") + +# not super important now, but to avoid putting the number 4 everywhere, make +# it an option/knob +parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs" \ + "sharing an SQC (icache, and thus icache TLB)") +parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \ + "per CU") +parser.add_option("--wf-size", type="int", default=64, + help="Wavefront size(in workitems)") +parser.add_option("--sp-bypass-path-length", type="int", default=4, \ + help="Number of stages of bypass path in vector ALU for Single Precision ops") +parser.add_option("--dp-bypass-path-length", type="int", default=4, \ + help="Number of stages of bypass path in vector ALU for Double Precision ops") +# issue period per SIMD unit: number of cycles before issuing another vector +parser.add_option("--issue-period", type="int", default=4, \ + help="Number of cycles per vector instruction issue period") +parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \ + help="VGPR to Coalescer (Global Memory) data bus width in bytes") +parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \ + help="Coalescer to VGPR (Global Memory) data bus width in bytes") +# Currently we only support 1 local memory pipe +parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \ + help="Number of Shared Memory pipelines per CU") +# Currently we only support 1 global memory pipe +parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \ + help="Number of Global Memory 
pipelines per CU") +parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \ + "WF slots per SIMD") + +parser.add_option("--vreg-file-size", type="int", default=2048, + help="number of physical vector registers per SIMD") +parser.add_option("--bw-scalor", type="int", default=0, + help="bandwidth scalor for scalability analysis") +parser.add_option("--CPUClock", type="string", default="2GHz", + help="CPU clock") +parser.add_option("--GPUClock", type="string", default="1GHz", + help="GPU clock") +parser.add_option("--cpu-voltage", action="store", type="string", + default='1.0V', + help = """CPU voltage domain""") +parser.add_option("--gpu-voltage", action="store", type="string", + default='1.0V', + help = """CPU voltage domain""") +parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST", + help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)") +parser.add_option("--xact-cas-mode", action="store_true", + help="enable load_compare mode (transactional CAS)") +parser.add_option("--SegFaultDebug",action="store_true", + help="checks for GPU seg fault before TLB access") +parser.add_option("--FunctionalTLB",action="store_true", + help="Assumes TLB has no latency") +parser.add_option("--LocalMemBarrier",action="store_true", + help="Barrier does not wait for writethroughs to complete") +parser.add_option("--countPages", action="store_true", + help="Count Page Accesses and output in per-CU output files") +parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"\ + "TLBs") +parser.add_option("--pf-type", type="string", help="type of prefetch: "\ + "PF_CU, PF_WF, PF_PHASE, PF_STRIDE") +parser.add_option("--pf-stride", type="int", help="set prefetch stride") +parser.add_option("--numLdsBanks", type="int", default=32, + help="number of physical banks per LDS module") +parser.add_option("--ldsBankConflictPenalty", type="int", default=1, + help="number of cycles per LDS bank conflict") + + +Ruby.define_options(parser) + +#add 
TLB options to the parser +GPUTLBOptions.tlb_options(parser) + +(options, args) = parser.parse_args() + +# The GPU cache coherence protocols only work with the backing store +setOption(parser, "--access-backing-store") + +# if benchmark root is specified explicitly, that overrides the search path +if options.benchmark_root: + benchmark_path = [options.benchmark_root] +else: + # Set default benchmark search path to current dir + benchmark_path = ['.'] + +########################## Sanity Check ######################## + +# Currently the gpu model requires ruby +if buildEnv['PROTOCOL'] == 'None': + fatal("GPU model requires ruby") + +# Currently the gpu model requires only timing or detailed CPU +if not (options.cpu_type == "timing" or + options.cpu_type == "detailed"): + fatal("GPU model requires timing or detailed CPU") + +# This file can support multiple compute units +assert(options.num_compute_units >= 1) + +# Currently, the sqc (I-Cache of GPU) is shared by +# multiple compute units(CUs). The protocol works just fine +# even if sqc is not shared. Overriding this option here +# so that the user need not explicitly set this (assuming +# sharing sqc is the common usage) +n_cu = options.num_compute_units +num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc)) +options.num_sqc = num_sqc # pass this to Ruby + +########################## Creating the GPU system ######################## +# shader is the GPU +shader = Shader(n_wf = options.wfs_per_simd, + clk_domain = SrcClockDomain( + clock = options.GPUClock, + voltage_domain = VoltageDomain( + voltage = options.gpu_voltage))) + +# GPU_RfO(Read For Ownership) implements SC/TSO memory model. +# Other GPU protocols implement release consistency at GPU side. +# So, all GPU protocols other than GPU_RfO should make their writes +# visible to the global memory and should read from global memory +# during kernel boundary. 
The pipeline initiates (or does not initiate) +# the acquire/release operation depending on this impl_kern_boundary_sync +# flag. This flag=true means the pipeline initiates an acquire/release operation +# at kernel boundary. +if buildEnv['PROTOCOL'] == 'GPU_RfO': + shader.impl_kern_boundary_sync = False +else: + shader.impl_kern_boundary_sync = True + +# Switching off per-lane TLB by default +per_lane = False +if options.TLB_config == "perLane": + per_lane = True + +# List of compute units; one GPU can have multiple compute units +compute_units = [] +for i in xrange(n_cu): + compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane, + num_SIMDs = options.simds_per_cu, + wfSize = options.wf_size, + spbypass_pipe_length = options.sp_bypass_path_length, + dpbypass_pipe_length = options.dp_bypass_path_length, + issue_period = options.issue_period, + coalescer_to_vrf_bus_width = \ + options.glbmem_rd_bus_width, + vrf_to_coalescer_bus_width = \ + options.glbmem_wr_bus_width, + num_global_mem_pipes = \ + options.glb_mem_pipes_per_cu, + num_shared_mem_pipes = \ + options.shr_mem_pipes_per_cu, + n_wf = options.wfs_per_simd, + execPolicy = options.CUExecPolicy, + xactCasMode = options.xact_cas_mode, + debugSegFault = options.SegFaultDebug, + functionalTLB = options.FunctionalTLB, + localMemBarrier = options.LocalMemBarrier, + countPages = options.countPages, + localDataStore = \ + LdsState(banks = options.numLdsBanks, + bankConflictPenalty = \ + options.ldsBankConflictPenalty))) + wavefronts = [] + vrfs = [] + for j in xrange(options.simds_per_cu): + for k in xrange(shader.n_wf): + wavefronts.append(Wavefront(simdId = j, wf_slot_id = k)) + vrfs.append(VectorRegisterFile(simd_id=j, + num_regs_per_simd=options.vreg_file_size)) + compute_units[-1].wavefronts = wavefronts + compute_units[-1].vector_register_file = vrfs + if options.TLB_prefetch: + compute_units[-1].prefetch_depth = options.TLB_prefetch + compute_units[-1].prefetch_prev_type = options.pf_type + + # attach the 
LDS and the CU to the bus (actually a Bridge) + compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave + compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort + +# Attach compute units to GPU +shader.CUs = compute_units + +########################## Creating the CPU system ######################## +options.num_cpus = options.num_cpus + +# The shader core will be whatever is after the CPU cores are accounted for +shader_idx = options.num_cpus + +# The command processor will be whatever is after the shader is accounted for +cp_idx = shader_idx + 1 +cp_list = [] + +# List of CPUs +cpu_list = [] + +# We only support timing mode for shader and memory +shader.timing = True +mem_mode = 'timing' + +# create the cpus +for i in range(options.num_cpus): + cpu = None + if options.cpu_type == "detailed": + cpu = DerivO3CPU(cpu_id=i, + clk_domain = SrcClockDomain( + clock = options.CPUClock, + voltage_domain = VoltageDomain( + voltage = options.cpu_voltage))) + elif options.cpu_type == "timing": + cpu = TimingSimpleCPU(cpu_id=i, + clk_domain = SrcClockDomain( + clock = options.CPUClock, + voltage_domain = VoltageDomain( + voltage = options.cpu_voltage))) + else: + fatal("Atomic CPU not supported/tested") + cpu_list.append(cpu) + +# create the command processors +for i in xrange(options.num_cp): + cp = None + if options.cpu_type == "detailed": + cp = DerivO3CPU(cpu_id = options.num_cpus + i, + clk_domain = SrcClockDomain( + clock = options.CPUClock, + voltage_domain = VoltageDomain( + voltage = options.cpu_voltage))) + elif options.cpu_type == 'timing': + cp = TimingSimpleCPU(cpu_id=options.num_cpus + i, + clk_domain = SrcClockDomain( + clock = options.CPUClock, + voltage_domain = VoltageDomain( + voltage = options.cpu_voltage))) + else: + fatal("Atomic CPU not supported/tested") + cp_list = cp_list + [cp] + +########################## Creating the GPU dispatcher ######################## +# Dispatcher dispatches work from host CPU to GPU +host_cpu = 
cpu_list[0] +dispatcher = GpuDispatcher() + +########################## Create and assign the workload ######################## +# Check for rel_path in elements of base_list using test, returning +# the first full path that satisfies test +def find_path(base_list, rel_path, test): + for base in base_list: + if not base: + # base could be None if environment var not set + continue + full_path = os.path.join(base, rel_path) + if test(full_path): + return full_path + fatal("%s not found in %s" % (rel_path, base_list)) + +def find_file(base_list, rel_path): + return find_path(base_list, rel_path, os.path.isfile) + +executable = find_path(benchmark_path, options.cmd, os.path.exists) +# it's common for a benchmark to be in a directory with the same +# name as the executable, so we handle that automatically +if os.path.isdir(executable): + benchmark_path = [executable] + executable = find_file(benchmark_path, options.cmd) +if options.kernel_files: + kernel_files = [find_file(benchmark_path, f) + for f in options.kernel_files.split(':')] +else: + # if kernel_files is not set, see if there's a unique .asm file + # in the same directory as the executable + kernel_path = os.path.dirname(executable) + kernel_files = glob.glob(os.path.join(kernel_path, '*.asm')) + if kernel_files: + print "Using GPU kernel code file(s)", ",".join(kernel_files) + else: + fatal("Can't locate kernel code (.asm) in " + kernel_path) + +# OpenCL driver +driver = ClDriver(filename="hsa", codefile=kernel_files) +for cpu in cpu_list: + cpu.workload = LiveProcess(executable = executable, + cmd = [options.cmd] + options.options.split(), + drivers = [driver]) +for cp in cp_list: + cp.workload = host_cpu.workload + +########################## Create the overall system ######################## +# Full list of processing cores in the system. 
Note that +# dispatcher is also added to cpu_list although it is +# not a processing element +cpu_list = cpu_list + [shader] + cp_list + [dispatcher] + +# creating the overall system +# notice the cpu list is explicitly added as a parameter to System +system = System(cpu = cpu_list, + mem_ranges = [AddrRange(options.mem_size)], + cache_line_size = options.cacheline_size, + mem_mode = mem_mode) +system.voltage_domain = VoltageDomain(voltage = options.sys_voltage) +system.clk_domain = SrcClockDomain(clock = options.sys_clock, + voltage_domain = system.voltage_domain) + +# configure the TLB hierarchy +GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) + +# create Ruby system +system.piobus = IOXBar(width=32, response_latency=0, + frontend_latency=0, forward_latency=0) +Ruby.create_system(options, None, system) +system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock, + voltage_domain = system.voltage_domain) + +# attach the CPU ports to Ruby +for i in range(options.num_cpus): + ruby_port = system.ruby._cpu_ports[i] + + # Create interrupt controller + system.cpu[i].createInterruptController() + + # Connect cache port's to ruby + system.cpu[i].icache_port = ruby_port.slave + system.cpu[i].dcache_port = ruby_port.slave + + ruby_port.mem_master_port = system.piobus.slave + if buildEnv['TARGET_ISA'] == "x86": + system.cpu[i].interrupts[0].pio = system.piobus.master + system.cpu[i].interrupts[0].int_master = system.piobus.slave + system.cpu[i].interrupts[0].int_slave = system.piobus.master + +# attach CU ports to Ruby +# Because of the peculiarities of the CP core, you may have 1 CPU but 2 +# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be +# hooked up until after the CP. To make this script generic, figure out +# the index as below, but note that this assumes there is one sequencer +# per compute unit and one sequencer per SQC for the math to work out +# correctly. 
+gpu_port_idx = len(system.ruby._cpu_ports) \ + - options.num_compute_units - options.num_sqc +gpu_port_idx = gpu_port_idx - options.num_cp * 2 + +wavefront_size = options.wf_size +for i in xrange(n_cu): + # The pipeline issues wavefront_size number of uncoalesced requests + # in one GPU issue cycle. Hence wavefront_size mem ports. + for j in xrange(wavefront_size): + system.cpu[shader_idx].CUs[i].memory_port[j] = \ + system.ruby._cpu_ports[gpu_port_idx].slave[j] + gpu_port_idx += 1 + +for i in xrange(n_cu): + if i > 0 and not i % options.cu_per_sqc: + print "incrementing idx on ", i + gpu_port_idx += 1 + system.cpu[shader_idx].CUs[i].sqc_port = \ + system.ruby._cpu_ports[gpu_port_idx].slave +gpu_port_idx = gpu_port_idx + 1 + +# attach CP ports to Ruby +for i in xrange(options.num_cp): + system.cpu[cp_idx].createInterruptController() + system.cpu[cp_idx].dcache_port = \ + system.ruby._cpu_ports[gpu_port_idx + i * 2].slave + system.cpu[cp_idx].icache_port = \ + system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave + system.cpu[cp_idx].interrupts[0].pio = system.piobus.master + system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave + system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master + cp_idx = cp_idx + 1 + +# connect dispatcher to the system.piobus +dispatcher.pio = system.piobus.master +dispatcher.dma = system.piobus.slave + +################# Connect the CPU and GPU via GPU Dispatcher ################### +# CPU rings the GPU doorbell to notify a pending task +# using this interface. +# And GPU uses this interface to notify the CPU of task completion +# The communcation happens through emulated driver. 
+ +# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array +# parameters must be after the explicit setting of the System cpu list +shader.cpu_pointer = host_cpu +dispatcher.cpu = host_cpu +dispatcher.shader_pointer = shader +dispatcher.cl_driver = driver + +########################## Start simulation ######################## + +root = Root(system=system, full_system=False) +m5.ticks.setGlobalFrequency('1THz') +if options.abs_max_tick: + maxtick = options.abs_max_tick +else: + maxtick = m5.MaxTick + +# Benchmarks support work item annotations +Simulation.setWorkCountOptions(system, options) + +# Checkpointing is not supported by APU model +if (options.checkpoint_dir != None or + options.checkpoint_restore != None): + fatal("Checkpointing not supported by apu model") + +checkpoint_dir = None +m5.instantiate(checkpoint_dir) + +# Map workload to this address space +host_cpu.workload[0].map(0x10000000, 0x200000000, 4096) + +exit_event = m5.simulate(maxtick) +print "Ticks:", m5.curTick() +print 'Exiting because ', exit_event.getCause() +sys.exit(exit_event.getCode()) diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py new file mode 100644 index 000000000..66ee4675f --- /dev/null +++ b/configs/example/ruby_gpu_random_test.py @@ -0,0 +1,187 @@ +# +# Copyright (c) 2010-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Brad Beckmann +# + +import m5 +from m5.objects import * +from m5.defines import buildEnv +from m5.util import addToPath +import os, optparse, sys +addToPath('../common') +addToPath('../ruby') +addToPath('../topologies') + +import Options +import Ruby + +# Get paths we might need. 
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)

# Command-line interface: the common gem5 options first, then the knobs that
# are specific to this tester script.
parser = optparse.OptionParser()
Options.addCommonOptions(parser)

parser.add_option("--maxloads", metavar="N", default=100,
                  help="Stop after N loads")
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
                  help="Wakeup every N cycles")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of compute units in the GPU")
parser.add_option("--numCPs", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
#
# Help texts that span lines are built from adjacent string literals; every
# piece carries its own separating space.  (A backslash continuation inside
# a literal would embed the next line's indentation in the help text.)
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC "
                       "(icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

#
# Add the ruby specific and protocol specific options
#
Ruby.define_options(parser)

# NOTE(review): Options is already imported and used above, so re-executing
# Options.py into this namespace looks redundant -- confirm before removing.
execfile(os.path.join(config_root, "common", "Options.py"))

(options, args) = parser.parse_args()

#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
+# +options.l1d_size="256B" +options.l1i_size="256B" +options.l2_size="512B" +options.l3_size="1kB" +options.l1d_assoc=2 +options.l1i_assoc=2 +options.l2_assoc=2 +options.l3_assoc=2 + +# This file can support multiple compute units +assert(options.num_compute_units >= 1) +n_cu = options.num_compute_units + +options.num_sqc = int((n_cu + options.cu_per_sqc - 1) / options.cu_per_sqc) + +if args: + print "Error: script doesn't take any positional arguments" + sys.exit(1) + +# +# Create the ruby random tester +# + +# Check to for the GPU_RfO protocol. Other GPU protocols are non-SC and will +# not work with the Ruby random tester. +assert(buildEnv['PROTOCOL'] == 'GPU_RfO') + +# The GPU_RfO protocol does not support cache flushes +check_flush = False + +tester = RubyTester(check_flush=check_flush, + checks_to_complete=options.maxloads, + wakeup_frequency=options.wakeup_freq, + deadlock_threshold=1000000) + +# +# Create the M5 system. Note that the Memory Object isn't +# actually used by the rubytester, but is included to support the +# M5 memory size == Ruby memory size checks +# +system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)]) + +# Create a top-level voltage domain and clock domain +system.voltage_domain = VoltageDomain(voltage=options.sys_voltage) + +system.clk_domain = SrcClockDomain(clock=options.sys_clock, + voltage_domain=system.voltage_domain) + +Ruby.create_system(options, False, system) + +# Create a seperate clock domain for Ruby +system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock, + voltage_domain=system.voltage_domain) + +tester.num_cpus = len(system.ruby._cpu_ports) + +# +# The tester is most effective when randomization is turned on and +# artifical delay is randomly inserted on messages +# +system.ruby.randomization = True + +for ruby_port in system.ruby._cpu_ports: + + # + # Tie the ruby tester ports to the ruby cpu read and write ports + # + if ruby_port.support_data_reqs and ruby_port.support_inst_reqs: + 
tester.cpuInstDataPort = ruby_port.slave + elif ruby_port.support_data_reqs: + tester.cpuDataPort = ruby_port.slave + elif ruby_port.support_inst_reqs: + tester.cpuInstPort = ruby_port.slave + + # Do not automatically retry stalled Ruby requests + ruby_port.no_retry_on_stall = True + + # + # Tell each sequencer this is the ruby tester so that it + # copies the subblock back to the checker + # + ruby_port.using_ruby_tester = True + +# ----------------------- +# run simulation +# ----------------------- + +root = Root( full_system = False, system = system ) +root.system.mem_mode = 'timing' + +# Not much point in this being higher than the L1 latency +m5.ticks.setGlobalFrequency('1ns') + +# instantiate configuration +m5.instantiate() + +# simulate until program terminates +exit_event = m5.simulate(options.abs_max_tick) + +print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause() diff --git a/configs/ruby/AMD_Base_Constructor.py b/configs/ruby/AMD_Base_Constructor.py new file mode 100644 index 000000000..d13153e9a --- /dev/null +++ b/configs/ruby/AMD_Base_Constructor.py @@ -0,0 +1,134 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
    """RubyCache preset used for the L1 instruction and data caches."""
    resourceStalls = False
    latency = 1

    def create(self, size, assoc, options):
        """Apply the given capacity and associativity; pick pseudo-LRU.

        `options` is accepted for signature compatibility and is not read.
        """
        self.assoc = assoc
        self.size = MemorySize(size)
        self.replacement_policy = PseudoLRUReplacementPolicy()

#
# Note: the L2 Cache latency is not currently used
#
class L2Cache(RubyCache):
    """RubyCache preset used for the L2 cache."""
    resourceStalls = False
    latency = 10

    def create(self, size, assoc, options):
        """Apply the given capacity and associativity; pick pseudo-LRU.

        `options` is accepted for signature compatibility and is not read.
        """
        self.assoc = assoc
        self.size = MemorySize(size)
        self.replacement_policy = PseudoLRUReplacementPolicy()
def construct(options, system, ruby_system):
    """Build the CPU-side controllers and return them with their cluster.

    One CPCntrl is created per pair of CPUs (rounded up for an odd count),
    connected to the Ruby network, attached to `system` as `cp_cntrl<i>`
    and added to a "CPU Cluster".

    Returns a tuple (cpu_sequencers, cpuCluster): the list holding both
    sequencers of every controller, and the Cluster holding the controllers.

    Panics unless the build protocol is one of the VIPER variants.
    """
    # The original test chained '!=' with 'or', which is true for every
    # protocol (no value equals all three names), so it always panicked.
    viper_protocols = ('GPU_VIPER', 'GPU_VIPER_Region', 'GPU_VIPER_Baseline')
    if buildEnv['PROTOCOL'] not in viper_protocols:
        panic("This script requires VIPER based protocols to be built.")

    cpu_sequencers = []
    cpuCluster = Cluster(name="CPU Cluster", extBW = 8, intBW=8) # 16 GB/s

    # For an odd number of CPUs, still create the right number of controllers
    for i in xrange((options.num_cpus + 1) // 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # Connect the CP controllers to the ruby network
        cp_cntrl.requestFromCore = ruby_system.network.slave
        cp_cntrl.responseFromCore = ruby_system.network.slave
        cp_cntrl.unblockFromCore = ruby_system.network.slave
        cp_cntrl.probeToCore = ruby_system.network.master
        cp_cntrl.responseToCore = ruby_system.network.master

        # Same attribute assignment the exec()'d string performed, without
        # compiling source text at run time.
        setattr(system, "cp_cntrl%d" % i, cp_cntrl)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
        cpuCluster.add(cp_cntrl)

    return cpu_sequencers, cpuCluster
class CntrlBase:
    """Mixin handing out sequencer, controller and per-type version ids."""

    # Shared across every subclass: always read and written on CntrlBase.
    _seqs = 0
    _cntrls = 0
    # Looked up and stored on the calling class, so each type counts alone.
    _version = 0

    @classmethod
    def seqCount(cls):
        """Return the next global sequencer id (0, 1, 2, ...)."""
        issued = CntrlBase._seqs
        CntrlBase._seqs = issued + 1
        return issued

    @classmethod
    def cntrlCount(cls):
        """Return the next global controller id (0, 1, 2, ...)."""
        issued = CntrlBase._cntrls
        CntrlBase._cntrls = issued + 1
        return issued

    @classmethod
    def versionCount(cls):
        """Return the next version number for this particular class."""
        issued = cls._version
        cls._version = issued + 1
        return issued
class TCPCntrl(TCP_Controller, CntrlBase):
    """TCP controller with an L1 cache, a GPU coalescer and a sequencer.

    create() configures the controller to use the coalescer; createCP()
    configures it to use the plain sequencer (use_seq_not_coal = True).
    """

    def _createCommon(self, options, ruby_system, for_cp):
        """Shared body of create() and createCP().

        Statement order follows the original methods exactly.  `for_cp`
        selects the two points where they differ: only the non-CP variant
        sets the coalescer's max_outstanding_requests (so the CP variant
        never reads the simds_per_cu / wfs_per_simd / wf_size options),
        and use_seq_not_coal is set to `for_cp`.
        """
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)

        self.coalescer = RubyGPUCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False
        if not for_cp:
            self.coalescer.max_outstanding_requests = options.simds_per_cu * \
                                                      options.wfs_per_simd * \
                                                      options.wf_size

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = bool(for_cp)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def create(self, options, ruby_system, system):
        """Configure this controller to issue through the coalescer.

        `system` is accepted for signature compatibility and is not read.
        """
        self._createCommon(options, ruby_system, False)

    def createCP(self, options, ruby_system, system):
        """Configure this controller to issue through the plain sequencer.

        `system` is accepted for signature compatibility and is not read.
        """
        self._createCommon(options, ruby_system, True)
self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.support_data_reqs = False + self.sequencer.is_cpu_sequencer = False + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def createCP(self, options, ruby_system, system): + self.version = self.versionCount() + + self.L1cache = SQCCache() + self.L1cache.create(options) + self.L1cache.resourceStalls = options.no_resource_stalls + + self.sequencer = RubySequencer() + + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.support_data_reqs = False + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + +class TCC(RubyCache): + assoc = 16 + dataAccessLatency = 8 + tagAccessLatency = 2 + resourceStalls = True + def create(self, options): + self.size = MemorySize(options.tcc_size) + self.size = self.size / options.num_tccs + self.dataArrayBanks = 256 / options.num_tccs #number of data banks + self.tagArrayBanks = 256 / options.num_tccs #number of tag banks + if ((self.size.value / long(self.assoc)) < 128): + self.size.value = long(128 * self.assoc) + self.start_index_bit = math.log(options.cacheline_size, 2) + \ + math.log(options.num_tccs, 2) + self.replacement_policy = PseudoLRUReplacementPolicy() + +class TCCCntrl(TCC_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L2cache = TCC() + self.L2cache.create(options) + self.l2_response_latency = options.TCC_latency + + self.number_of_TBEs = 2048 + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir, + tcc_unblock_to_tccdir, 
class L3Cache(RubyCache):
    """RubyCache preset for one directory's slice of the L3."""
    assoc = 8
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        """Size this slice from the command-line options.

        Capacity and bank counts are split evenly over options.num_dirs;
        latencies and resource-stall modelling come straight from options.
        `ruby_system` and `system` are accepted for signature compatibility
        and are not read.
        """
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        # Divide the bank counts once, matching the capacity split above.
        # The original repeated these two lines, leaving each slice with
        # banks / num_dirs**2 whenever num_dirs > 1.
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = options.no_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()
def define_options(parser):
    """Register the GPU_RfO protocol's command-line options on `parser`.

    `parser` is an optparse.OptionParser (anything offering add_option()).
    Option names, types and defaults are unchanged; only help texts were
    added or corrected.
    """
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20,
                      help="L3 data array access latency")
    parser.add_option("--l3-tag-latency", type="int", default=15,
                      help="L3 tag array access latency")
    parser.add_option("--cpu-to-dir-latency", type="int", default=15,
                      help="issue latency of the CPU core-pair controllers")
    parser.add_option("--gpu-to-dir-latency", type="int", default=160)
    # store_false: options.no_resource_stalls is True unless the flag is
    # given, and that value is what the caches copy into resourceStalls.
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True,
                      help="do not model resource stalls in the caches")
    parser.add_option("--num-tbes", type="int", default=256,
                      help="number of TBEs in each directory controller")
    parser.add_option("--l2-latency", type="int", default=50) # load to use
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC directories and banks in the GPU")
    parser.add_option("--TCP_latency", type="int", default=4,
                      help="TCP latency")
    parser.add_option("--TCC_latency", type="int", default=16,
                      help="TCC latency")
    parser.add_option("--tcc-size", type='string', default='256kB',
                      help="aggregate tcc size")
    parser.add_option("--tcp-size", type='string', default='16kB',
                      help="tcp size")
    parser.add_option("--tcc-dir-factor", type='int', default=4,
                      help="TCCdir size = factor *(TCPs + TCC)")
+ # + cp_cntrl_nodes = [] + tcp_cntrl_nodes = [] + sqc_cntrl_nodes = [] + tcc_cntrl_nodes = [] + tccdir_cntrl_nodes = [] + dir_cntrl_nodes = [] + l3_cntrl_nodes = [] + + # + # Must create the individual controllers before the network to ensure the + # controller constructors are called before the network constructor + # + + TCC_bits = int(math.log(options.num_tccs, 2)) + + # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu + # Clusters + mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s + for i in xrange(options.num_dirs): + + dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits) + dir_cntrl.create(options, ruby_system, system) + dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units + #Enough TBEs for all TCP TBEs + + # Connect the Directory controller to the ruby network + dir_cntrl.requestFromCores = MessageBuffer(ordered = True) + dir_cntrl.requestFromCores.slave = ruby_system.network.master + + dir_cntrl.responseFromCores = MessageBuffer() + dir_cntrl.responseFromCores.slave = ruby_system.network.master + + dir_cntrl.unblockFromCores = MessageBuffer() + dir_cntrl.unblockFromCores.slave = ruby_system.network.master + + dir_cntrl.probeToCore = MessageBuffer() + dir_cntrl.probeToCore.master = ruby_system.network.slave + + dir_cntrl.responseToCore = MessageBuffer() + dir_cntrl.responseToCore.master = ruby_system.network.slave + + dir_cntrl.triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.responseFromMemory = MessageBuffer() + + exec("system.dir_cntrl%d = dir_cntrl" % i) + dir_cntrl_nodes.append(dir_cntrl) + + mainCluster.add(dir_cntrl) + + # For an odd number of CPUs, still create the right number of controllers + cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s + for i in xrange((options.num_cpus + 1) / 2): + + cp_cntrl = CPCntrl() + cp_cntrl.create(options, ruby_system, system) + + exec("system.cp_cntrl%d = cp_cntrl" % i) + # + # Add controllers and sequencers 
to the appropriate lists + # + cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1]) + + # Connect the CP controllers and the network + cp_cntrl.requestFromCore = MessageBuffer() + cp_cntrl.requestFromCore.master = ruby_system.network.slave + + cp_cntrl.responseFromCore = MessageBuffer() + cp_cntrl.responseFromCore.master = ruby_system.network.slave + + cp_cntrl.unblockFromCore = MessageBuffer() + cp_cntrl.unblockFromCore.master = ruby_system.network.slave + + cp_cntrl.probeToCore = MessageBuffer() + cp_cntrl.probeToCore.slave = ruby_system.network.master + + cp_cntrl.responseToCore = MessageBuffer() + cp_cntrl.responseToCore.slave = ruby_system.network.master + + cp_cntrl.mandatoryQueue = MessageBuffer() + cp_cntrl.triggerQueue = MessageBuffer(ordered = True) + + cpuCluster.add(cp_cntrl) + + gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s + + for i in xrange(options.num_compute_units): + + tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits, + number_of_TBEs = 2560) # max outstanding requests + tcp_cntrl.create(options, ruby_system, system) + + exec("system.tcp_cntrl%d = tcp_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(tcp_cntrl.coalescer) + tcp_cntrl_nodes.append(tcp_cntrl) + + # Connect the TCP controller to the ruby network + tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.requestFromTCP.master = ruby_system.network.slave + + tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseFromTCP.master = ruby_system.network.slave + + tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True) + tcp_cntrl.unblockFromCore.master = ruby_system.network.slave + + tcp_cntrl.probeToTCP = MessageBuffer(ordered = True) + tcp_cntrl.probeToTCP.slave = ruby_system.network.master + + tcp_cntrl.responseToTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseToTCP.slave = ruby_system.network.master + + tcp_cntrl.mandatoryQueue = MessageBuffer() + + 
gpuCluster.add(tcp_cntrl) + + for i in xrange(options.num_sqc): + + sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits) + sqc_cntrl.create(options, ruby_system, system) + + exec("system.sqc_cntrl%d = sqc_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(sqc_cntrl.sequencer) + + # Connect the SQC controller to the ruby network + sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.requestFromSQC.master = ruby_system.network.slave + + sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseFromSQC.master = ruby_system.network.slave + + sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True) + sqc_cntrl.unblockFromCore.master = ruby_system.network.slave + + sqc_cntrl.probeToSQC = MessageBuffer(ordered = True) + sqc_cntrl.probeToSQC.slave = ruby_system.network.master + + sqc_cntrl.responseToSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseToSQC.slave = ruby_system.network.master + + sqc_cntrl.mandatoryQueue = MessageBuffer() + + # SQC also in GPU cluster + gpuCluster.add(sqc_cntrl) + + for i in xrange(options.numCPs): + + tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits, + number_of_TBEs = 2560) # max outstanding requests + tcp_cntrl.createCP(options, ruby_system, system) + + exec("system.tcp_cntrl%d = tcp_cntrl" % (options.num_compute_units + i)) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(tcp_cntrl.sequencer) + tcp_cntrl_nodes.append(tcp_cntrl) + + # Connect the TCP controller to the ruby network + tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.requestFromTCP.master = ruby_system.network.slave + + tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseFromTCP.master = ruby_system.network.slave + + tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True) + tcp_cntrl.unblockFromCore.master = ruby_system.network.slave + + tcp_cntrl.probeToTCP = MessageBuffer(ordered = True) 
+ tcp_cntrl.probeToTCP.slave = ruby_system.network.master + + tcp_cntrl.responseToTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseToTCP.slave = ruby_system.network.master + + tcp_cntrl.mandatoryQueue = MessageBuffer() + + gpuCluster.add(tcp_cntrl) + + sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits) + sqc_cntrl.createCP(options, ruby_system, system) + + exec("system.sqc_cntrl%d = sqc_cntrl" % (options.num_compute_units + i)) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(sqc_cntrl.sequencer) + + # Connect the SQC controller to the ruby network + sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.requestFromSQC.master = ruby_system.network.slave + + sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseFromSQC.master = ruby_system.network.slave + + sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True) + sqc_cntrl.unblockFromCore.master = ruby_system.network.slave + + sqc_cntrl.probeToSQC = MessageBuffer(ordered = True) + sqc_cntrl.probeToSQC.slave = ruby_system.network.master + + sqc_cntrl.responseToSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseToSQC.slave = ruby_system.network.master + + sqc_cntrl.mandatoryQueue = MessageBuffer() + + # SQC also in GPU cluster + gpuCluster.add(sqc_cntrl) + + for i in xrange(options.num_tccs): + + tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits, + number_of_TBEs = options.num_compute_units * 2560) + #Enough TBEs for all TCP TBEs + tcc_cntrl.create(options, ruby_system, system) + tcc_cntrl_nodes.append(tcc_cntrl) + + tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits, + number_of_TBEs = options.num_compute_units * 2560) + #Enough TBEs for all TCP TBEs + tccdir_cntrl.create(options, ruby_system, system) + tccdir_cntrl_nodes.append(tccdir_cntrl) + + exec("system.tcc_cntrl%d = tcc_cntrl" % i) + exec("system.tccdir_cntrl%d = tccdir_cntrl" % i) + + # connect all of the wire buffers between L3 and dirs up + 
req_to_tccdir = RubyWireBuffer() + resp_to_tccdir = RubyWireBuffer() + tcc_unblock_to_tccdir = RubyWireBuffer() + req_to_tcc = RubyWireBuffer() + probe_to_tcc = RubyWireBuffer() + resp_to_tcc = RubyWireBuffer() + + tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir, + tcc_unblock_to_tccdir, req_to_tcc, + probe_to_tcc, resp_to_tcc) + tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir, + tcc_unblock_to_tccdir, req_to_tcc, + probe_to_tcc, resp_to_tcc) + + # Connect the TCC controller to the ruby network + tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True) + tcc_cntrl.responseFromTCC.master = ruby_system.network.slave + + tcc_cntrl.responseToTCC = MessageBuffer(ordered = True) + tcc_cntrl.responseToTCC.slave = ruby_system.network.master + + # Connect the TCC Dir controller to the ruby network + tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master + + tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True) + tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master + + tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True) + tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master + + tccdir_cntrl.probeToCore = MessageBuffer(ordered = True) + tccdir_cntrl.probeToCore.master = ruby_system.network.slave + + tccdir_cntrl.responseToCore = MessageBuffer(ordered = True) + tccdir_cntrl.responseToCore.master = ruby_system.network.slave + + tccdir_cntrl.probeFromNB = MessageBuffer() + tccdir_cntrl.probeFromNB.slave = ruby_system.network.master + + tccdir_cntrl.responseFromNB = MessageBuffer() + tccdir_cntrl.responseFromNB.slave = ruby_system.network.master + + tccdir_cntrl.requestToNB = MessageBuffer() + tccdir_cntrl.requestToNB.master = ruby_system.network.slave + + tccdir_cntrl.responseToNB = MessageBuffer() + tccdir_cntrl.responseToNB.master = ruby_system.network.slave + + tccdir_cntrl.unblockToNB = MessageBuffer() + tccdir_cntrl.unblockToNB.master = 
ruby_system.network.slave + + tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True) + + # TCC cntrls added to the GPU cluster + gpuCluster.add(tcc_cntrl) + gpuCluster.add(tccdir_cntrl) + + # Assuming no DMA devices + assert(len(dma_devices) == 0) + + # Add cpu/gpu clusters to main cluster + mainCluster.add(cpuCluster) + mainCluster.add(gpuCluster) + + ruby_system.network.number_of_virtual_networks = 10 + + return (cpu_sequencers, dir_cntrl_nodes, mainCluster) diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py new file mode 100644 index 000000000..f1384c404 --- /dev/null +++ b/configs/ruby/GPU_VIPER.py @@ -0,0 +1,674 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts

from Cluster import Cluster
from Crossbar import Crossbar

# NOTE: this file is Python 2 (xrange, long, integer '/'); the integer
# divisions below rely on Python 2 truncating semantics.

class CntrlBase:
    """Mixin that hands out sequencer, controller and version numbers."""

    _seqs = 0
    @classmethod
    def seqCount(cls):
        """Return the next sequencer id, counted across all subclasses."""
        # Use SeqCount not class since we need global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        """Return the next controller id, counted across all subclasses."""
        # Use CntlCount not class since we need global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        """Return the next version number for this particular class.

        The augmented assignment stores the counter on ``cls`` itself, so
        every subclass counts independently starting from 0.
        """
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

class L1Cache(RubyCache):
    """CPU L1 cache; size and associativity are supplied via create()."""

    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1
    def create(self, size, assoc, options):
        """Set size, associativity and a pseudo-LRU replacement policy.

        ``options`` is accepted for signature symmetry and is not read.
        """
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    """CPU L2 cache; size and associativity are supplied via create()."""

    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    def create(self, size, assoc, options):
        """Set size, associativity and a pseudo-LRU replacement policy.

        ``options`` is accepted for signature symmetry and is not read.
        """
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(CorePair_Controller, CntrlBase):
    """Core-pair controller: two sequencers sharing one L1I and one L2."""

    def create(self, options, ruby_system, system):
        """Build the caches and the two sequencers of this core pair.

        Reads l1i/l1d/l2 size and assoc, cpu_to_dir_latency and
        recycle_latency from ``options``. ``system`` is not used.
        """
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        # Core 0: shared instruction cache, private data cache 0.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        # Core 1: shared instruction cache, private data cache 1.
        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCPCache(RubyCache):
    """GPU L1 data cache (TCP)."""

    size = "16kB"
    assoc = 16
    dataArrayBanks = 16 #number of data banks
    tagArrayBanks = 16  #number of tag banks
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        """Apply tcp_size, tcp_assoc and the stall setting from options."""
        self.size = MemorySize(options.tcp_size)
        self.assoc = options.tcp_assoc
        # NOTE(review): this reads the TCC stall option for a TCP cache;
        # possibly no_resource_stalls was intended - confirm.
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()

class TCPCntrl(TCP_Controller, CntrlBase):
    """TCP controller with both a coalescer and a plain sequencer."""

    def _create_common(self, options, ruby_system, use_seq_not_coal):
        """Shared body of create() and createCP().

        The two public entry points differ only in the value stored in
        ``use_seq_not_coal``.
        """
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
                                dataAccessLatency = options.TCP_latency)
        # NOTE(review): TCPCache.create() below assigns resourceStalls
        # again, so this value does not survive - confirm which is meant.
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)
        self.issue_latency = 1

        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = use_seq_not_coal

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def create(self, options, ruby_system, system):
        """Configure a TCP with use_seq_not_coal = False.

        ``system`` is not used.
        """
        self._create_common(options, ruby_system, False)

    def createCP(self, options, ruby_system, system):
        """Configure a TCP with use_seq_not_coal = True.

        ``system`` is not used.
        """
        self._create_common(options, ruby_system, True)

class SQCCache(RubyCache):
    """GPU instruction cache (SQC)."""

    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        """Apply sqc_size and sqc_assoc from options."""
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class SQCCntrl(SQC_Controller, CntrlBase):
    """SQC controller; its sequencer has data requests disabled."""

    def create(self, options, ruby_system, system):
        """Build the SQC cache and its sequencer. ``system`` is not used."""
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        self.sequencer = RubySequencer()

        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCC(RubyCache):
    """GPU L2 cache (TCC); one instance per TCC bank."""

    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True
    def create(self, options):
        """Size this bank and choose its bank counts and index bit.

        With a positive ``options.bw_scalor`` the aggregate size is
        128kB per compute unit with 64 banks; otherwise it is
        ``options.tcc_size`` with 256 banks split across the TCCs. The
        aggregate size is then divided by ``options.num_tccs``.
        """
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            s = options.num_compute_units
            tcc_size = s * 128
            tcc_size = str(tcc_size)+'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            self.dataArrayBanks = 256 / options.num_tccs #number of data banks
            self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
        self.size.value = self.size.value / options.num_tccs
        # Enforce a floor of 128 (in units of size.value) per way.
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        # Index starts above the line-offset bits and the TCC-select bits.
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy()


class TCCCntrl(TCC_Controller, CntrlBase):
    """TCC controller owning one TCC bank."""

    def create(self, options, ruby_system, system):
        """Build the TCC bank. ``system`` is not used."""
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.L2cache.resourceStalls = options.no_tcc_resource_stalls

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class L3Cache(RubyCache):
    """L3 cache slice; one instance per directory."""

    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        """Split the L3 size and bank counts evenly across directories.

        ``ruby_system`` and ``system`` are not used.
        """
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.assoc = options.l3_assoc
        # Divide the bank counts by num_dirs exactly once. The original
        # repeated these two statements, which under integer division
        # left 0 banks for num_dirs > 4.
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L3Cntrl(L3Cache_Controller, CntrlBase):
    """Stand-alone L3 controller (create_system below does not use it)."""

    def create(self, options, ruby_system, system):
        """Build the L3 slice and derive the response latency from it."""
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)

        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Store the six wire buffers linking this L3 and its directory."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

class DirMem(RubyDirectoryMemory, CntrlBase):
    """Directory memory covering an equal share of physical memory."""

    def create(self, options, ruby_system, system):
        """Size the directory as mem_size / num_dirs.

        ``ruby_system`` and ``system`` are not used.
        """
        self.version = self.versionCount()

        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size

class DirCntrl(Directory_Controller, CntrlBase):
    """Directory controller with an embedded L3 slice."""

    def create(self, options, ruby_system, system):
        """Build the directory memory and L3 slice of this controller."""
        self.version = self.versionCount()

        self.response_latency = 30

        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)

        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)

        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
                                  self.L3CacheMemory.tagAccessLatency)

        self.number_of_TBEs = options.num_tbes

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Store the six wire buffers linking this directory and an L3."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

def define_options(parser):
    """Register the GPU_VIPER command-line options on ``parser``.

    Note that --no-resource-stalls and --no-tcc-resource-stalls are
    store_false flags defaulting to True: the stored value is True unless
    the flag is given on the command line.
    """
    parser.add_option("--num-subcaches", type = "int", default = 4)
    parser.add_option("--l3-data-latency", type = "int", default = 20)
    parser.add_option("--l3-tag-latency", type = "int", default = 15)
    parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--no-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--no-tcc-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
    parser.add_option("--num-tbes", type = "int", default = 256)
    parser.add_option("--l2-latency", type = "int", default = 50) # load to use
    parser.add_option("--num-tccs", type = "int", default = 1,
                      help = "number of TCC banks in the GPU")
    parser.add_option("--sqc-size", type = 'string', default = '32kB',
                      help = "SQC cache size")
    parser.add_option("--sqc-assoc", type = 'int', default = 8,
                      help = "SQC cache assoc")
    parser.add_option("--WB_L1", action = "store_true", default = False,
                      help = "writeback L1")
    parser.add_option("--WB_L2", action = "store_true", default = False,
                      help = "writeback L2")
    parser.add_option("--TCP_latency", type = "int", default = 4,
                      help = "TCP latency")
    parser.add_option("--TCC_latency", type = "int", default = 16,
                      help = "TCC latency")
    parser.add_option("--tcc-size", type = 'string', default = '256kB',
                      help = "agregate tcc size")
    parser.add_option("--tcc-assoc", type = 'int', default = 16,
                      help = "tcc assoc")
    parser.add_option("--tcp-size", type = 'string', default = '16kB',
                      help = "tcp size")
    parser.add_option("--tcp-assoc", type = 'int', default = 16,
                      help = "tcp assoc")
    parser.add_option("--noL1", action = "store_true", default = False,
                      help = "bypassL1")

def create_system(options, full_system, system, dma_devices, ruby_system):
    """Instantiate every GPU_VIPER controller and group them in clusters.

    Directories go straight into the main cluster; core pairs go into a
    CPU cluster; TCPs, SQCs and TCCs go into a GPU cluster. Each
    controller is attached to ``ruby_system`` as ``<kind>_cntrl<N>``.

    ``dma_devices`` must be empty. ``full_system`` and ``system`` are only
    passed through to the controllers' create() methods or ignored.

    Returns the tuple (cpu_sequencers, dir_cntrl_nodes, mainCluster).
    """
    if buildEnv['PROTOCOL'] != 'GPU_VIPER':
        # NOTE(review): panic is not imported by name in this file;
        # presumably it arrives through the star import - confirm.
        panic("This script requires the GPU_VIPER protocol to be built.")

    cpu_sequencers = []
    dir_cntrl_nodes = []

    #
    # Must create the individual controllers before the network to ensure the
    # controller constructors are called before the network constructor
    #

    TCC_bits = int(math.log(options.num_tccs, 2))

    # Cluster bandwidth is scaled with the GPU size only when the caller's
    # option set provides a positive bw_scalor.
    scale_bw = hasattr(options, 'bw_scalor') and options.bw_scalor > 0

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    crossbar_bw = None
    if scale_bw:
        #Assuming a 2GHz clock
        crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
        mainCluster = Cluster(intBW = crossbar_bw)
    else:
        mainCluster = Cluster(intBW = 8) # 16 GB/s

    for i in xrange(options.num_dirs):

        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, ruby_system, system)
        dir_cntrl.number_of_TBEs = options.num_tbes
        dir_cntrl.useL3OnWT = options.use_L3_on_WT
        # the number_of_TBEs is inclusive of TBEs below

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        setattr(ruby_system, "dir_cntrl%d" % i, dir_cntrl)
        dir_cntrl_nodes.append(dir_cntrl)

        mainCluster.add(dir_cntrl)

    if scale_bw:
        cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    else:
        cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s

    # For an odd number of CPUs, still create the right number of controllers
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        setattr(ruby_system, "cp_cntrl%d" % i, cp_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    if scale_bw:
        gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    else:
        gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s

    for i in xrange(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1
        tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
        tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency

        setattr(ruby_system, "tcp_cntrl%d" % i, tcp_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        # Unordered here, unlike the command-processor TCPs further down.
        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        setattr(ruby_system, "sqc_cntrl%d" % i, sqc_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    # Command processors: each gets one TCP (using its sequencer rather
    # than its coalescer) and one SQC, numbered after the compute units'.
    for i in xrange(options.numCPs):

        tcp_ID = options.num_compute_units + i
        sqc_ID = options.num_sqc + i

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.createCP(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1
        tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
        tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency

        setattr(ruby_system, "tcp_cntrl%d" % tcp_ID, tcp_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.sequencer)

        # Connect the CP (TCP) controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        setattr(ruby_system, "sqc_cntrl%d" % sqc_ID, sqc_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # NOTE(review): unlike the num_sqc loop above, no message buffers
        # are attached to this SQC - confirm that is intended.

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in xrange(options.num_tccs):

        tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
        # the number_of_TBEs is inclusive of TBEs below

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        setattr(ruby_system, "tcc_cntrl%d" % i, tcc_cntrl)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
a/configs/ruby/GPU_VIPER_Baseline.py b/configs/ruby/GPU_VIPER_Baseline.py new file mode 100644 index 000000000..879b34e88 --- /dev/null +++ b/configs/ruby/GPU_VIPER_Baseline.py @@ -0,0 +1,588 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Sooraj Puthoor +# + +import math +import m5 +from m5.objects import * +from m5.defines import buildEnv +from Ruby import create_topology +from Ruby import send_evicts + +from Cluster import Cluster +from Crossbar import Crossbar + +class CntrlBase: + _seqs = 0 + @classmethod + def seqCount(cls): + # Use SeqCount not class since we need global count + CntrlBase._seqs += 1 + return CntrlBase._seqs - 1 + + _cntrls = 0 + @classmethod + def cntrlCount(cls): + # Use CntlCount not class since we need global count + CntrlBase._cntrls += 1 + return CntrlBase._cntrls - 1 + + _version = 0 + @classmethod + def versionCount(cls): + cls._version += 1 # Use count for this particular type + return cls._version - 1 + +class L1Cache(RubyCache): + resourceStalls = False + dataArrayBanks = 2 + tagArrayBanks = 2 + dataAccessLatency = 1 + tagAccessLatency = 1 + def create(self, size, assoc, options): + self.size = MemorySize(size) + self.assoc = assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L2Cache(RubyCache): + resourceStalls = False + assoc = 16 + dataArrayBanks = 16 + tagArrayBanks = 16 + def create(self, size, assoc, options): + self.size = MemorySize(size) + self.assoc = assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class CPCntrl(CorePair_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + + self.L1Icache = L1Cache() + self.L1Icache.create(options.l1i_size, options.l1i_assoc, options) + self.L1D0cache = L1Cache() + self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options) + self.L1D1cache = L1Cache() + self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options) + self.L2cache = L2Cache() + self.L2cache.create(options.l2_size, options.l2_assoc, options) + + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1Icache + self.sequencer.dcache = self.L1D0cache + self.sequencer.ruby_system = 
ruby_system + self.sequencer.coreid = 0 + self.sequencer.is_cpu_sequencer = True + + self.sequencer1 = RubySequencer() + self.sequencer1.version = self.seqCount() + self.sequencer1.icache = self.L1Icache + self.sequencer1.dcache = self.L1D1cache + self.sequencer1.ruby_system = ruby_system + self.sequencer1.coreid = 1 + self.sequencer1.is_cpu_sequencer = True + + self.issue_latency = options.cpu_to_dir_latency + self.send_evictions = send_evicts(options) + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class TCPCache(RubyCache): + size = "16kB" + assoc = 16 + dataArrayBanks = 16 + tagArrayBanks = 16 + dataAccessLatency = 4 + tagAccessLatency = 1 + def create(self, options): + self.size = MemorySize(options.tcp_size) + self.dataArrayBanks = 16 + self.tagArrayBanks = 16 + self.dataAccessLatency = 4 + self.tagAccessLatency = 1 + self.resourceStalls = options.no_tcc_resource_stalls + self.replacement_policy = PseudoLRUReplacementPolicy() + +class TCPCntrl(TCP_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L1cache = TCPCache() + self.L1cache.create(options) + self.issue_latency = 1 + + self.coalescer = VIPERCoalescer() + self.coalescer.version = self.seqCount() + self.coalescer.icache = self.L1cache + self.coalescer.dcache = self.L1cache + self.coalescer.ruby_system = ruby_system + self.coalescer.support_inst_reqs = False + self.coalescer.is_cpu_sequencer = False + + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.is_cpu_sequencer = True + + self.use_seq_not_coal = False + + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class SQCCache(RubyCache): + dataArrayBanks = 8 + tagArrayBanks = 8 + 
dataAccessLatency = 1 + tagAccessLatency = 1 + + def create(self, options): + self.size = MemorySize(options.sqc_size) + self.assoc = options.sqc_assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class SQCCntrl(SQC_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L1cache = SQCCache() + self.L1cache.create(options) + self.L1cache.resourceStalls = False + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.support_data_reqs = False + self.sequencer.is_cpu_sequencer = False + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class TCC(RubyCache): + size = MemorySize("256kB") + assoc = 16 + dataAccessLatency = 8 + tagAccessLatency = 2 + resourceStalls = True + def create(self, options): + self.assoc = options.tcc_assoc + if hasattr(options, 'bw_scalor') and options.bw_scalor > 0: + s = options.num_compute_units + tcc_size = s * 128 + tcc_size = str(tcc_size)+'kB' + self.size = MemorySize(tcc_size) + self.dataArrayBanks = 64 + self.tagArrayBanks = 64 + else: + self.size = MemorySize(options.tcc_size) + self.dataArrayBanks = 256 / options.num_tccs #number of data banks + self.tagArrayBanks = 256 / options.num_tccs #number of tag banks + self.size.value = self.size.value / options.num_tccs + if ((self.size.value / long(self.assoc)) < 128): + self.size.value = long(128 * self.assoc) + self.start_index_bit = math.log(options.cacheline_size, 2) + \ + math.log(options.num_tccs, 2) + self.replacement_policy = PseudoLRUReplacementPolicy() + +class TCCCntrl(TCC_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L2cache = TCC() + self.L2cache.create(options) + self.ruby_system = ruby_system + 
self.L2cache.resourceStalls = options.no_tcc_resource_stalls + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class L3Cache(RubyCache): + dataArrayBanks = 16 + tagArrayBanks = 16 + + def create(self, options, ruby_system, system): + self.size = MemorySize(options.l3_size) + self.size.value /= options.num_dirs + self.assoc = options.l3_assoc + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataAccessLatency = options.l3_data_latency + self.tagAccessLatency = options.l3_tag_latency + self.resourceStalls = False + self.replacement_policy = PseudoLRUReplacementPolicy() + +class ProbeFilter(RubyCache): + size = "4MB" + assoc = 16 + dataArrayBanks = 256 + tagArrayBanks = 256 + + def create(self, options, ruby_system, system): + self.block_size = "%dB" % (64 * options.blocks_per_region) + self.size = options.region_dir_entries * \ + self.block_size * options.num_compute_units + self.assoc = 8 + self.tagArrayBanks = 8 + self.tagAccessLatency = options.dir_tag_latency + self.dataAccessLatency = 1 + self.resourceStalls = options.no_resource_stalls + self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2)) + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L3Cntrl(L3Cache_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L3cache = L3Cache() + self.L3cache.create(options, ruby_system, system) + self.l3_response_latency = \ + max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency) + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + 
self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +class DirMem(RubyDirectoryMemory, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + + phys_mem_size = AddrRange(options.mem_size).size() + mem_module_size = phys_mem_size / options.num_dirs + dir_size = MemorySize('0B') + dir_size.value = mem_module_size + self.size = dir_size + +class DirCntrl(Directory_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.response_latency = 30 + self.directory = DirMem() + self.directory.create(options, ruby_system, system) + self.L3CacheMemory = L3Cache() + self.L3CacheMemory.create(options, ruby_system, system) + self.ProbeFilterMemory = ProbeFilter() + self.ProbeFilterMemory.create(options, ruby_system, system) + self.l3_hit_latency = \ + max(self.L3CacheMemory.dataAccessLatency, + self.L3CacheMemory.tagAccessLatency) + + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +def define_options(parser): + parser.add_option("--num-subcaches", type = "int", default = 4) + parser.add_option("--l3-data-latency", type = "int", default = 20) + parser.add_option("--l3-tag-latency", type = "int", default = 15) + parser.add_option("--cpu-to-dir-latency", type = "int", default = 120) + parser.add_option("--gpu-to-dir-latency", type = "int", default = 120) + parser.add_option("--no-resource-stalls", action = "store_false", + default = True) + parser.add_option("--no-tcc-resource-stalls", action = "store_false", + default = True) + parser.add_option("--num-tbes", type = "int", 
default = 2560) + parser.add_option("--l2-latency", type = "int", default = 50) # load to use + parser.add_option("--num-tccs", type = "int", default = 1, + help = "number of TCC banks in the GPU") + parser.add_option("--sqc-size", type = 'string', default = '32kB', + help = "SQC cache size") + parser.add_option("--sqc-assoc", type = 'int', default = 8, + help = "SQC cache assoc") + parser.add_option("--region-dir-entries", type = "int", default = 8192) + parser.add_option("--dir-tag-latency", type = "int", default = 8) + parser.add_option("--dir-tag-banks", type = "int", default = 4) + parser.add_option("--blocks-per-region", type = "int", default = 1) + parser.add_option("--use-L3-on-WT", action = "store_true", default = False) + parser.add_option("--nonInclusiveDir", action = "store_true", + default = False) + parser.add_option("--WB_L1", action = "store_true", + default = False, help = "writeback L2") + parser.add_option("--WB_L2", action = "store_true", + default = False, help = "writeback L2") + parser.add_option("--TCP_latency", type = "int", + default = 4, help = "TCP latency") + parser.add_option("--TCC_latency", type = "int", + default = 16, help = "TCC latency") + parser.add_option("--tcc-size", type = 'string', default = '2MB', + help = "agregate tcc size") + parser.add_option("--tcc-assoc", type = 'int', default = 16, + help = "tcc assoc") + parser.add_option("--tcp-size", type = 'string', default = '16kB', + help = "tcp size") + parser.add_option("--sampler-sets", type = "int", default = 1024) + parser.add_option("--sampler-assoc", type = "int", default = 16) + parser.add_option("--sampler-counter", type = "int", default = 512) + parser.add_option("--noL1", action = "store_true", default = False, + help = "bypassL1") + parser.add_option("--noL2", action = "store_true", default = False, + help = "bypassL2") + +def create_system(options, full_system, system, dma_devices, ruby_system): + if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline': + panic("This 
script requires the" \ + "GPU_VIPER_Baseline protocol to be built.") + + cpu_sequencers = [] + + # + # The ruby network creation expects the list of nodes in the system to be + # consistent with the NetDest list. Therefore the l1 controller nodes + # must be listed before the directory nodes and directory nodes before + # dma nodes, etc. + # + cp_cntrl_nodes = [] + tcp_cntrl_nodes = [] + sqc_cntrl_nodes = [] + tcc_cntrl_nodes = [] + dir_cntrl_nodes = [] + l3_cntrl_nodes = [] + + # + # Must create the individual controllers before the network to ensure the + # controller constructors are called before the network constructor + # + + # For an odd number of CPUs, still create the right number of controllers + TCC_bits = int(math.log(options.num_tccs, 2)) + + # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu + # Clusters + crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock + mainCluster = Cluster(intBW = crossbar_bw) + for i in xrange(options.num_dirs): + + dir_cntrl = DirCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits) + dir_cntrl.create(options, ruby_system, system) + dir_cntrl.number_of_TBEs = options.num_tbes + dir_cntrl.useL3OnWT = options.use_L3_on_WT + dir_cntrl.inclusiveDir = not options.nonInclusiveDir + + # Connect the Directory controller to the ruby network + dir_cntrl.requestFromCores = MessageBuffer(ordered = True) + dir_cntrl.requestFromCores.slave = ruby_system.network.master + + dir_cntrl.responseFromCores = MessageBuffer() + dir_cntrl.responseFromCores.slave = ruby_system.network.master + + dir_cntrl.unblockFromCores = MessageBuffer() + dir_cntrl.unblockFromCores.slave = ruby_system.network.master + + dir_cntrl.probeToCore = MessageBuffer() + dir_cntrl.probeToCore.master = ruby_system.network.slave + + dir_cntrl.responseToCore = MessageBuffer() + dir_cntrl.responseToCore.master = ruby_system.network.slave + + dir_cntrl.triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.L3triggerQueue = MessageBuffer(ordered 
= True) + dir_cntrl.responseFromMemory = MessageBuffer() + + exec("system.dir_cntrl%d = dir_cntrl" % i) + dir_cntrl_nodes.append(dir_cntrl) + mainCluster.add(dir_cntrl) + + cpuCluster = Cluster(extBW = crossbar_bw, intBW=crossbar_bw) + for i in xrange((options.num_cpus + 1) / 2): + + cp_cntrl = CPCntrl() + cp_cntrl.create(options, ruby_system, system) + + exec("system.cp_cntrl%d = cp_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1]) + + # Connect the CP controllers and the network + cp_cntrl.requestFromCore = MessageBuffer() + cp_cntrl.requestFromCore.master = ruby_system.network.slave + + cp_cntrl.responseFromCore = MessageBuffer() + cp_cntrl.responseFromCore.master = ruby_system.network.slave + + cp_cntrl.unblockFromCore = MessageBuffer() + cp_cntrl.unblockFromCore.master = ruby_system.network.slave + + cp_cntrl.probeToCore = MessageBuffer() + cp_cntrl.probeToCore.slave = ruby_system.network.master + + cp_cntrl.responseToCore = MessageBuffer() + cp_cntrl.responseToCore.slave = ruby_system.network.master + + cp_cntrl.mandatoryQueue = MessageBuffer() + cp_cntrl.triggerQueue = MessageBuffer(ordered = True) + + cpuCluster.add(cp_cntrl) + + gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw) + for i in xrange(options.num_compute_units): + + tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits, + issue_latency = 1, + number_of_TBEs = 2560) + # TBEs set to max outstanding requests + tcp_cntrl.create(options, ruby_system, system) + tcp_cntrl.WB = options.WB_L1 + tcp_cntrl.disableL1 = options.noL1 + + exec("system.tcp_cntrl%d = tcp_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(tcp_cntrl.coalescer) + tcp_cntrl_nodes.append(tcp_cntrl) + + # Connect the CP (TCP) controllers to the ruby network + tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.requestFromTCP.master = ruby_system.network.slave 
+ + tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseFromTCP.master = ruby_system.network.slave + + tcp_cntrl.unblockFromCore = MessageBuffer() + tcp_cntrl.unblockFromCore.master = ruby_system.network.slave + + tcp_cntrl.probeToTCP = MessageBuffer(ordered = True) + tcp_cntrl.probeToTCP.slave = ruby_system.network.master + + tcp_cntrl.responseToTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseToTCP.slave = ruby_system.network.master + + tcp_cntrl.mandatoryQueue = MessageBuffer() + + gpuCluster.add(tcp_cntrl) + + for i in xrange(options.num_sqc): + + sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits) + sqc_cntrl.create(options, ruby_system, system) + + exec("system.sqc_cntrl%d = sqc_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(sqc_cntrl.sequencer) + + # Connect the SQC controller to the ruby network + sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.requestFromSQC.master = ruby_system.network.slave + + sqc_cntrl.probeToSQC = MessageBuffer(ordered = True) + sqc_cntrl.probeToSQC.slave = ruby_system.network.master + + sqc_cntrl.responseToSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseToSQC.slave = ruby_system.network.master + + sqc_cntrl.mandatoryQueue = MessageBuffer() + + # SQC also in GPU cluster + gpuCluster.add(sqc_cntrl) + + # Because of wire buffers, num_tccs must equal num_tccdirs + numa_bit = 6 + + for i in xrange(options.num_tccs): + + tcc_cntrl = TCCCntrl() + tcc_cntrl.create(options, ruby_system, system) + tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency + tcc_cntrl.l2_response_latency = options.TCC_latency + tcc_cntrl_nodes.append(tcc_cntrl) + tcc_cntrl.WB = options.WB_L2 + tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units + + # Connect the TCC controllers to the ruby network + tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcc_cntrl.requestFromTCP.slave = ruby_system.network.master + + 
tcc_cntrl.responseToCore = MessageBuffer(ordered = True) + tcc_cntrl.responseToCore.master = ruby_system.network.slave + + tcc_cntrl.probeFromNB = MessageBuffer() + tcc_cntrl.probeFromNB.slave = ruby_system.network.master + + tcc_cntrl.responseFromNB = MessageBuffer() + tcc_cntrl.responseFromNB.slave = ruby_system.network.master + + tcc_cntrl.requestToNB = MessageBuffer(ordered = True) + tcc_cntrl.requestToNB.master = ruby_system.network.slave + + tcc_cntrl.responseToNB = MessageBuffer() + tcc_cntrl.responseToNB.master = ruby_system.network.slave + + tcc_cntrl.unblockToNB = MessageBuffer() + tcc_cntrl.unblockToNB.master = ruby_system.network.slave + + tcc_cntrl.triggerQueue = MessageBuffer(ordered = True) + + exec("system.tcc_cntrl%d = tcc_cntrl" % i) + # connect all of the wire buffers between L3 and dirs up + # TCC cntrls added to the GPU cluster + gpuCluster.add(tcc_cntrl) + + # Assuming no DMA devices + assert(len(dma_devices) == 0) + + # Add cpu/gpu clusters to main cluster + mainCluster.add(cpuCluster) + mainCluster.add(gpuCluster) + + ruby_system.network.number_of_virtual_networks = 10 + + return (cpu_sequencers, dir_cntrl_nodes, mainCluster) diff --git a/configs/ruby/GPU_VIPER_Region.py b/configs/ruby/GPU_VIPER_Region.py new file mode 100644 index 000000000..94cb9b70b --- /dev/null +++ b/configs/ruby/GPU_VIPER_Region.py @@ -0,0 +1,758 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Sooraj Puthoor +# + +import math +import m5 +from m5.objects import * +from m5.defines import buildEnv +from Ruby import send_evicts + +from Cluster import Cluster + +class CntrlBase: + _seqs = 0 + @classmethod + def seqCount(cls): + # Use SeqCount not class since we need global count + CntrlBase._seqs += 1 + return CntrlBase._seqs - 1 + + _cntrls = 0 + @classmethod + def cntrlCount(cls): + # Use CntlCount not class since we need global count + CntrlBase._cntrls += 1 + return CntrlBase._cntrls - 1 + + _version = 0 + @classmethod + def versionCount(cls): + cls._version += 1 # Use count for this particular type + return cls._version - 1 + +# +# Note: the L1 Cache latency is only used by the sequencer on fast path hits +# +class L1Cache(RubyCache): + resourceStalls = False + dataArrayBanks = 2 + tagArrayBanks = 2 + dataAccessLatency = 1 + tagAccessLatency = 1 + def create(self, size, assoc, options): + self.size = MemorySize(size) + self.assoc = assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L2Cache(RubyCache): + resourceStalls = False + assoc = 16 + dataArrayBanks = 16 + tagArrayBanks = 16 + def create(self, size, assoc, options): + self.size = MemorySize(size) + self.assoc = assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class CPCntrl(CorePair_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + + self.L1Icache = L1Cache() + self.L1Icache.create(options.l1i_size, options.l1i_assoc, options) + self.L1D0cache = L1Cache() + self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options) + self.L1D1cache = L1Cache() + self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options) + self.L2cache = L2Cache() + self.L2cache.create(options.l2_size, options.l2_assoc, options) + + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1Icache + self.sequencer.dcache = self.L1D0cache + 
self.sequencer.ruby_system = ruby_system + self.sequencer.coreid = 0 + self.sequencer.is_cpu_sequencer = True + + self.sequencer1 = RubySequencer() + self.sequencer1.version = self.seqCount() + self.sequencer1.icache = self.L1Icache + self.sequencer1.dcache = self.L1D1cache + self.sequencer1.ruby_system = ruby_system + self.sequencer1.coreid = 1 + self.sequencer1.is_cpu_sequencer = True + + self.issue_latency = 1 + self.send_evictions = send_evicts(options) + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class TCPCache(RubyCache): + size = "16kB" + assoc = 16 + dataArrayBanks = 16 + tagArrayBanks = 16 + dataAccessLatency = 4 + tagAccessLatency = 1 + def create(self, options): + self.size = MemorySize(options.tcp_size) + self.dataArrayBanks = 16 + self.tagArrayBanks = 16 + self.dataAccessLatency = 4 + self.tagAccessLatency = 1 + self.resourceStalls = options.no_tcc_resource_stalls + self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc) + +class TCPCntrl(TCP_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency) + self.L1cache.create(options) + self.issue_latency = 1 + + self.coalescer = VIPERCoalescer() + self.coalescer.version = self.seqCount() + self.coalescer.icache = self.L1cache + self.coalescer.dcache = self.L1cache + self.coalescer.ruby_system = ruby_system + self.coalescer.support_inst_reqs = False + self.coalescer.is_cpu_sequencer = False + + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.is_cpu_sequencer = True + + self.use_seq_not_coal = False + + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class 
SQCCache(RubyCache): + dataArrayBanks = 8 + tagArrayBanks = 8 + dataAccessLatency = 1 + tagAccessLatency = 1 + + def create(self, options): + self.size = MemorySize(options.sqc_size) + self.assoc = options.sqc_assoc + self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc) + +class SQCCntrl(SQC_Controller, CntrlBase): + + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L1cache = SQCCache() + self.L1cache.create(options) + self.L1cache.resourceStalls = False + self.sequencer = RubySequencer() + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1cache + self.sequencer.dcache = self.L1cache + self.sequencer.ruby_system = ruby_system + self.sequencer.support_data_reqs = False + self.sequencer.is_cpu_sequencer = False + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class TCC(RubyCache): + size = MemorySize("256kB") + assoc = 16 + dataAccessLatency = 8 + tagAccessLatency = 2 + resourceStalls = False + def create(self, options): + self.assoc = options.tcc_assoc + if hasattr(options, 'bw_scalor') and options.bw_scalor > 0: + s = options.num_compute_units + tcc_size = s * 128 + tcc_size = str(tcc_size)+'kB' + self.size = MemorySize(tcc_size) + self.dataArrayBanks = 64 + self.tagArrayBanks = 64 + else: + self.size = MemorySize(options.tcc_size) + self.dataArrayBanks = 256 / options.num_tccs #number of data banks + self.tagArrayBanks = 256 / options.num_tccs #number of tag banks + self.size.value = self.size.value / options.num_tccs + if ((self.size.value / long(self.assoc)) < 128): + self.size.value = long(128 * self.assoc) + self.start_index_bit = math.log(options.cacheline_size, 2) + \ + math.log(options.num_tccs, 2) + self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc) + +class TCCCntrl(TCC_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + 
self.L2cache = TCC() + self.L2cache.create(options) + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class L3Cache(RubyCache): + dataArrayBanks = 16 + tagArrayBanks = 16 + + def create(self, options, ruby_system, system): + self.size = MemorySize(options.l3_size) + self.size.value /= options.num_dirs + self.assoc = options.l3_assoc + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataAccessLatency = options.l3_data_latency + self.tagAccessLatency = options.l3_tag_latency + self.resourceStalls = False + self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc) + +class L3Cntrl(L3Cache_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L3cache = L3Cache() + self.L3cache.create(options, ruby_system, system) + self.l3_response_latency = \ + max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency) + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +# Directory memory: Directory memory of infinite size which is +# used by directory controller to store the "states" of the +# state machine. 
The state machine is implemented per cache block +class DirMem(RubyDirectoryMemory, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + phys_mem_size = AddrRange(options.mem_size).size() + mem_module_size = phys_mem_size / options.num_dirs + dir_size = MemorySize('0B') + dir_size.value = mem_module_size + self.size = dir_size + +# Directory controller: Contains directory memory, L3 cache and associated state +# machine which is used to accurately redirect a data request to L3 cache or to +# memory. The permissions requests do not come to this directory for region +# based protocols as they are handled exclusively by the region directory. +# However, region directory controller uses this directory controller for +# sending probe requests and receiving probe responses. +class DirCntrl(Directory_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.response_latency = 25 + self.response_latency_regionDir = 1 + self.directory = DirMem() + self.directory.create(options, ruby_system, system) + self.L3CacheMemory = L3Cache() + self.L3CacheMemory.create(options, ruby_system, system) + self.l3_hit_latency = \ + max(self.L3CacheMemory.dataAccessLatency, + self.L3CacheMemory.tagAccessLatency) + + self.ruby_system = ruby_system + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +# Region directory : Stores region permissions +class RegionDir(RubyCache): + + def create(self, options, ruby_system, system): + self.block_size = "%dB" % (64 * options.blocks_per_region) + self.size = options.region_dir_entries * \ + self.block_size * 
options.num_compute_units + self.assoc = 8 + self.tagArrayBanks = 8 + self.tagAccessLatency = options.dir_tag_latency + self.dataAccessLatency = 1 + self.resourceStalls = options.no_resource_stalls + self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2)) + self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc) +# Region directory controller : Contains region directory and associated state +# machine for dealing with region coherence requests. +class RegionCntrl(RegionDir_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.cacheMemory = RegionDir() + self.cacheMemory.create(options, ruby_system, system) + self.blocksPerRegion = options.blocks_per_region + self.toDirLatency = \ + max(self.cacheMemory.dataAccessLatency, + self.cacheMemory.tagAccessLatency) + self.ruby_system = ruby_system + self.always_migrate = options.always_migrate + self.sym_migrate = options.symmetric_migrate + self.asym_migrate = options.asymmetric_migrate + if self.always_migrate: + assert(not self.asym_migrate and not self.sym_migrate) + if self.sym_migrate: + assert(not self.always_migrate and not self.asym_migrate) + if self.asym_migrate: + assert(not self.always_migrate and not self.sym_migrate) + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +# Region Buffer: A region directory cache which avoids some potential +# long latency lookup of region directory for getting region permissions +class RegionBuffer(RubyCache): + assoc = 4 + dataArrayBanks = 256 + tagArrayBanks = 256 + dataAccessLatency = 1 + tagAccessLatency = 1 + resourceStalls = True + +class RBCntrl(RegionBuffer_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.cacheMemory = RegionBuffer() + self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls + self.cacheMemory.dataArrayBanks = 64 + self.cacheMemory.tagArrayBanks = 64 + 
self.blocksPerRegion = options.blocks_per_region + self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion) + self.cacheMemory.start_index_bit = \ + 6 + int(math.log(self.blocksPerRegion, 2)) + self.cacheMemory.size = options.region_buffer_entries * \ + self.cacheMemory.block_size * options.num_compute_units + self.toDirLatency = options.gpu_to_dir_latency + self.toRegionDirLatency = options.cpu_to_dir_latency + self.noTCCdir = True + TCC_bits = int(math.log(options.num_tccs, 2)) + self.TCC_select_num_bits = TCC_bits + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + self.cacheMemory.replacement_policy = \ + PseudoLRUReplacementPolicy(assoc = self.cacheMemory.assoc) + +def define_options(parser): + parser.add_option("--num-subcaches", type="int", default=4) + parser.add_option("--l3-data-latency", type="int", default=20) + parser.add_option("--l3-tag-latency", type="int", default=15) + parser.add_option("--cpu-to-dir-latency", type="int", default=120) + parser.add_option("--gpu-to-dir-latency", type="int", default=60) + parser.add_option("--no-resource-stalls", action="store_false", + default=True) + parser.add_option("--no-tcc-resource-stalls", action="store_false", + default=True) + parser.add_option("--num-tbes", type="int", default=32) + parser.add_option("--l2-latency", type="int", default=50) # load to use + parser.add_option("--num-tccs", type="int", default=1, + help="number of TCC banks in the GPU") + + parser.add_option("--sqc-size", type='string', default='32kB', + help="SQC cache size") + parser.add_option("--sqc-assoc", type='int', default=8, + help="SQC cache assoc") + + parser.add_option("--WB_L1", action="store_true", + default=False, help="L2 Writeback Cache") + parser.add_option("--WB_L2", action="store_true", + default=False, help="L2 Writeback Cache") + parser.add_option("--TCP_latency", + type="int", default=4, help="TCP latency") + parser.add_option("--TCC_latency", + 
type="int", default=16, help="TCC latency") + parser.add_option("--tcc-size", type='string', default='2MB', + help="agregate tcc size") + parser.add_option("--tcc-assoc", type='int', default=16, + help="tcc assoc") + parser.add_option("--tcp-size", type='string', default='16kB', + help="tcp size") + + parser.add_option("--dir-tag-latency", type="int", default=4) + parser.add_option("--dir-tag-banks", type="int", default=4) + parser.add_option("--blocks-per-region", type="int", default=16) + parser.add_option("--dir-entries", type="int", default=8192) + + # Region buffer is a cache of region directory. Hence region + # directory is inclusive with respect to region buffer. + # However, region directory is non-inclusive with respect to + # the caches in the system + parser.add_option("--region-dir-entries", type="int", default=1024) + parser.add_option("--region-buffer-entries", type="int", default=512) + + parser.add_option("--always-migrate", + action="store_true", default=False) + parser.add_option("--symmetric-migrate", + action="store_true", default=False) + parser.add_option("--asymmetric-migrate", + action="store_true", default=False) + parser.add_option("--use-L3-on-WT", action="store_true", default=False) + +def create_system(options, full_system, system, dma_devices, ruby_system): + if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region': + panic("This script requires the GPU_VIPER_Region protocol to be built.") + + cpu_sequencers = [] + + # + # The ruby network creation expects the list of nodes in the system to be + # consistent with the NetDest list. Therefore the l1 controller nodes + # must be listed before the directory nodes and directory nodes before + # dma nodes, etc. 
+ # + dir_cntrl_nodes = [] + + # For an odd number of CPUs, still create the right number of controllers + TCC_bits = int(math.log(options.num_tccs, 2)) + + # + # Must create the individual controllers before the network to ensure the + # controller constructors are called before the network constructor + # + + # For an odd number of CPUs, still create the right number of controllers + crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock + cpuCluster = Cluster(extBW = (crossbar_bw), intBW=crossbar_bw) + for i in xrange((options.num_cpus + 1) / 2): + + cp_cntrl = CPCntrl() + cp_cntrl.create(options, ruby_system, system) + + rb_cntrl = RBCntrl() + rb_cntrl.create(options, ruby_system, system) + rb_cntrl.number_of_TBEs = 256 + rb_cntrl.isOnCPU = True + + cp_cntrl.regionBufferNum = rb_cntrl.version + + exec("system.cp_cntrl%d = cp_cntrl" % i) + exec("system.rb_cntrl%d = rb_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1]) + + # Connect the CP controllers and the network + cp_cntrl.requestFromCore = MessageBuffer() + cp_cntrl.requestFromCore.master = ruby_system.network.slave + + cp_cntrl.responseFromCore = MessageBuffer() + cp_cntrl.responseFromCore.master = ruby_system.network.slave + + cp_cntrl.unblockFromCore = MessageBuffer() + cp_cntrl.unblockFromCore.master = ruby_system.network.slave + + cp_cntrl.probeToCore = MessageBuffer() + cp_cntrl.probeToCore.slave = ruby_system.network.master + + cp_cntrl.responseToCore = MessageBuffer() + cp_cntrl.responseToCore.slave = ruby_system.network.master + + cp_cntrl.mandatoryQueue = MessageBuffer() + cp_cntrl.triggerQueue = MessageBuffer(ordered = True) + + # Connect the RB controllers to the ruby network + rb_cntrl.requestFromCore = MessageBuffer(ordered = True) + rb_cntrl.requestFromCore.slave = ruby_system.network.master + + rb_cntrl.responseFromCore = MessageBuffer() + rb_cntrl.responseFromCore.slave = 
ruby_system.network.master + + rb_cntrl.requestToNetwork = MessageBuffer() + rb_cntrl.requestToNetwork.master = ruby_system.network.slave + + rb_cntrl.notifyFromRegionDir = MessageBuffer() + rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master + + rb_cntrl.probeFromRegionDir = MessageBuffer() + rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master + + rb_cntrl.unblockFromDir = MessageBuffer() + rb_cntrl.unblockFromDir.slave = ruby_system.network.master + + rb_cntrl.responseToRegDir = MessageBuffer() + rb_cntrl.responseToRegDir.master = ruby_system.network.slave + + rb_cntrl.triggerQueue = MessageBuffer(ordered = True) + + cpuCluster.add(cp_cntrl) + cpuCluster.add(rb_cntrl) + + gpuCluster = Cluster(extBW = (crossbar_bw), intBW = crossbar_bw) + for i in xrange(options.num_compute_units): + + tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits, + issue_latency = 1, + number_of_TBEs = 2560) + # TBEs set to max outstanding requests + tcp_cntrl.create(options, ruby_system, system) + tcp_cntrl.WB = options.WB_L1 + tcp_cntrl.disableL1 = False + + exec("system.tcp_cntrl%d = tcp_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(tcp_cntrl.coalescer) + + # Connect the CP (TCP) controllers to the ruby network + tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.requestFromTCP.master = ruby_system.network.slave + + tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseFromTCP.master = ruby_system.network.slave + + tcp_cntrl.unblockFromCore = MessageBuffer() + tcp_cntrl.unblockFromCore.master = ruby_system.network.slave + + tcp_cntrl.probeToTCP = MessageBuffer(ordered = True) + tcp_cntrl.probeToTCP.slave = ruby_system.network.master + + tcp_cntrl.responseToTCP = MessageBuffer(ordered = True) + tcp_cntrl.responseToTCP.slave = ruby_system.network.master + + tcp_cntrl.mandatoryQueue = MessageBuffer() + + gpuCluster.add(tcp_cntrl) + + for i in xrange(options.num_sqc): + 
+ sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits) + sqc_cntrl.create(options, ruby_system, system) + + exec("system.sqc_cntrl%d = sqc_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(sqc_cntrl.sequencer) + + # Connect the SQC controller to the ruby network + sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True) + sqc_cntrl.requestFromSQC.master = ruby_system.network.slave + + sqc_cntrl.probeToSQC = MessageBuffer(ordered = True) + sqc_cntrl.probeToSQC.slave = ruby_system.network.master + + sqc_cntrl.responseToSQC = MessageBuffer(ordered = True) + sqc_cntrl.responseToSQC.slave = ruby_system.network.master + + sqc_cntrl.mandatoryQueue = MessageBuffer() + + # SQC also in GPU cluster + gpuCluster.add(sqc_cntrl) + + numa_bit = 6 + + for i in xrange(options.num_tccs): + + tcc_cntrl = TCCCntrl() + tcc_cntrl.create(options, ruby_system, system) + tcc_cntrl.l2_request_latency = 1 + tcc_cntrl.l2_response_latency = options.TCC_latency + tcc_cntrl.WB = options.WB_L2 + tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units + + # Connect the TCC controllers to the ruby network + tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True) + tcc_cntrl.requestFromTCP.slave = ruby_system.network.master + + tcc_cntrl.responseToCore = MessageBuffer(ordered = True) + tcc_cntrl.responseToCore.master = ruby_system.network.slave + + tcc_cntrl.probeFromNB = MessageBuffer() + tcc_cntrl.probeFromNB.slave = ruby_system.network.master + + tcc_cntrl.responseFromNB = MessageBuffer() + tcc_cntrl.responseFromNB.slave = ruby_system.network.master + + tcc_cntrl.requestToNB = MessageBuffer(ordered = True) + tcc_cntrl.requestToNB.master = ruby_system.network.slave + + tcc_cntrl.responseToNB = MessageBuffer() + tcc_cntrl.responseToNB.master = ruby_system.network.slave + + tcc_cntrl.unblockToNB = MessageBuffer() + tcc_cntrl.unblockToNB.master = ruby_system.network.slave + + tcc_cntrl.triggerQueue = MessageBuffer(ordered = True) + + rb_cntrl 
= RBCntrl() + rb_cntrl.create(options, ruby_system, system) + rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units + rb_cntrl.isOnCPU = False + + # Connect the RB controllers to the ruby network + rb_cntrl.requestFromCore = MessageBuffer(ordered = True) + rb_cntrl.requestFromCore.slave = ruby_system.network.master + + rb_cntrl.responseFromCore = MessageBuffer() + rb_cntrl.responseFromCore.slave = ruby_system.network.master + + rb_cntrl.requestToNetwork = MessageBuffer() + rb_cntrl.requestToNetwork.master = ruby_system.network.slave + + rb_cntrl.notifyFromRegionDir = MessageBuffer() + rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master + + rb_cntrl.probeFromRegionDir = MessageBuffer() + rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master + + rb_cntrl.unblockFromDir = MessageBuffer() + rb_cntrl.unblockFromDir.slave = ruby_system.network.master + + rb_cntrl.responseToRegDir = MessageBuffer() + rb_cntrl.responseToRegDir.master = ruby_system.network.slave + + rb_cntrl.triggerQueue = MessageBuffer(ordered = True) + + tcc_cntrl.regionBufferNum = rb_cntrl.version + + exec("system.tcc_cntrl%d = tcc_cntrl" % i) + exec("system.tcc_rb_cntrl%d = rb_cntrl" % i) + + # TCC cntrls added to the GPU cluster + gpuCluster.add(tcc_cntrl) + gpuCluster.add(rb_cntrl) + + # Because of wire buffers, num_l3caches must equal num_dirs + # Region coherence only works with 1 dir + assert(options.num_l3caches == options.num_dirs == 1) + + # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu + # Clusters + mainCluster = Cluster(intBW = crossbar_bw) + + dir_cntrl = DirCntrl() + dir_cntrl.create(options, ruby_system, system) + dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units + dir_cntrl.useL3OnWT = options.use_L3_on_WT + + # Connect the Directory controller to the ruby network + dir_cntrl.requestFromCores = MessageBuffer() + dir_cntrl.requestFromCores.slave = ruby_system.network.master + + dir_cntrl.responseFromCores = MessageBuffer() + 
dir_cntrl.responseFromCores.slave = ruby_system.network.master + + dir_cntrl.unblockFromCores = MessageBuffer() + dir_cntrl.unblockFromCores.slave = ruby_system.network.master + + dir_cntrl.probeToCore = MessageBuffer() + dir_cntrl.probeToCore.master = ruby_system.network.slave + + dir_cntrl.responseToCore = MessageBuffer() + dir_cntrl.responseToCore.master = ruby_system.network.slave + + dir_cntrl.reqFromRegBuf = MessageBuffer() + dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master + + dir_cntrl.reqToRegDir = MessageBuffer(ordered = True) + dir_cntrl.reqToRegDir.master = ruby_system.network.slave + + dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True) + dir_cntrl.reqFromRegDir.slave = ruby_system.network.master + + dir_cntrl.unblockToRegDir = MessageBuffer() + dir_cntrl.unblockToRegDir.master = ruby_system.network.slave + + dir_cntrl.triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.responseFromMemory = MessageBuffer() + + exec("system.dir_cntrl%d = dir_cntrl" % i) + dir_cntrl_nodes.append(dir_cntrl) + + mainCluster.add(dir_cntrl) + + reg_cntrl = RegionCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits) + reg_cntrl.create(options, ruby_system, system) + reg_cntrl.number_of_TBEs = options.num_tbes + reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version + reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version + + # Connect the Region Dir controllers to the ruby network + reg_cntrl.requestToDir = MessageBuffer(ordered = True) + reg_cntrl.requestToDir.master = ruby_system.network.slave + + reg_cntrl.notifyToRBuffer = MessageBuffer() + reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave + + reg_cntrl.probeToRBuffer = MessageBuffer() + reg_cntrl.probeToRBuffer.master = ruby_system.network.slave + + reg_cntrl.responseFromRBuffer = MessageBuffer() + reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master + + reg_cntrl.requestFromRegBuf = MessageBuffer() + 
reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master + + reg_cntrl.triggerQueue = MessageBuffer(ordered = True) + + exec("system.reg_cntrl%d = reg_cntrl" % i) + + mainCluster.add(reg_cntrl) + + # Assuming no DMA devices + assert(len(dma_devices) == 0) + + # Add cpu/gpu clusters to main cluster + mainCluster.add(cpuCluster) + mainCluster.add(gpuCluster) + + ruby_system.network.number_of_virtual_networks = 10 + + return (cpu_sequencers, dir_cntrl_nodes, mainCluster) diff --git a/configs/ruby/MOESI_AMD_Base.py b/configs/ruby/MOESI_AMD_Base.py new file mode 100644 index 000000000..4c8ad28b0 --- /dev/null +++ b/configs/ruby/MOESI_AMD_Base.py @@ -0,0 +1,326 @@ +# +# Copyright (c) 2010-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Lisa Hsu +# + +import math +import m5 +from m5.objects import * +from m5.defines import buildEnv +from Ruby import create_topology +from Ruby import send_evicts + +from Cluster import Cluster +from Crossbar import Crossbar + +class CntrlBase: + _seqs = 0 + @classmethod + def seqCount(cls): + # Use SeqCount not class since we need global count + CntrlBase._seqs += 1 + return CntrlBase._seqs - 1 + + _cntrls = 0 + @classmethod + def cntrlCount(cls): + # Use CntlCount not class since we need global count + CntrlBase._cntrls += 1 + return CntrlBase._cntrls - 1 + + _version = 0 + @classmethod + def versionCount(cls): + cls._version += 1 # Use count for this particular type + return cls._version - 1 + +class L1DCache(RubyCache): + resourceStalls = False + def create(self, options): + self.size = MemorySize(options.l1d_size) + self.assoc = options.l1d_assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L1ICache(RubyCache): + resourceStalls = False + def create(self, options): + self.size = MemorySize(options.l1i_size) + self.assoc = options.l1i_assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L2Cache(RubyCache): + resourceStalls = False + def create(self, options): + self.size = MemorySize(options.l2_size) + self.assoc = options.l2_assoc + self.replacement_policy = PseudoLRUReplacementPolicy() + +class CPCntrl(CorePair_Controller, CntrlBase): + + def create(self, options, ruby_system, 
system): + self.version = self.versionCount() + + self.L1Icache = L1ICache() + self.L1Icache.create(options) + self.L1D0cache = L1DCache() + self.L1D0cache.create(options) + self.L1D1cache = L1DCache() + self.L1D1cache.create(options) + self.L2cache = L2Cache() + self.L2cache.create(options) + + self.sequencer = RubySequencer() + self.sequencer.icache_hit_latency = 2 + self.sequencer.dcache_hit_latency = 2 + self.sequencer.version = self.seqCount() + self.sequencer.icache = self.L1Icache + self.sequencer.dcache = self.L1D0cache + self.sequencer.ruby_system = ruby_system + self.sequencer.coreid = 0 + self.sequencer.is_cpu_sequencer = True + + self.sequencer1 = RubySequencer() + self.sequencer1.version = self.seqCount() + self.sequencer1.icache = self.L1Icache + self.sequencer1.dcache = self.L1D1cache + self.sequencer1.icache_hit_latency = 2 + self.sequencer1.dcache_hit_latency = 2 + self.sequencer1.ruby_system = ruby_system + self.sequencer1.coreid = 1 + self.sequencer1.is_cpu_sequencer = True + + self.issue_latency = options.cpu_to_dir_latency + self.send_evictions = send_evicts(options) + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + +class L3Cache(RubyCache): + assoc = 8 + dataArrayBanks = 256 + tagArrayBanks = 256 + + def create(self, options, ruby_system, system): + self.size = MemorySize(options.l3_size) + self.size.value /= options.num_dirs + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataArrayBanks /= options.num_dirs + self.tagArrayBanks /= options.num_dirs + self.dataAccessLatency = options.l3_data_latency + self.tagAccessLatency = options.l3_tag_latency + self.resourceStalls = options.no_resource_stalls + self.replacement_policy = PseudoLRUReplacementPolicy() + +class L3Cntrl(L3Cache_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + self.L3cache = L3Cache() + 
self.L3cache.create(options, ruby_system, system) + + self.l3_response_latency = max(self.L3cache.dataAccessLatency, + self.L3cache.tagAccessLatency) + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +class DirMem(RubyDirectoryMemory, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + + phys_mem_size = AddrRange(options.mem_size).size() + mem_module_size = phys_mem_size / options.num_dirs + dir_size = MemorySize('0B') + dir_size.value = mem_module_size + self.size = dir_size + +class DirCntrl(Directory_Controller, CntrlBase): + def create(self, options, ruby_system, system): + self.version = self.versionCount() + + self.response_latency = 30 + + self.directory = DirMem() + self.directory.create(options, ruby_system, system) + + self.L3CacheMemory = L3Cache() + self.L3CacheMemory.create(options, ruby_system, system) + + self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency, + self.L3CacheMemory.tagAccessLatency) + + self.number_of_TBEs = options.num_tbes + + self.ruby_system = ruby_system + + if options.recycle_latency: + self.recycle_latency = options.recycle_latency + + self.CPUonly = True + + def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir, + req_to_l3, probe_to_l3, resp_to_l3): + self.reqToDir = req_to_dir + self.respToDir = resp_to_dir + self.l3UnblockToDir = l3_unblock_to_dir + self.reqToL3 = req_to_l3 + self.probeToL3 = probe_to_l3 + self.respToL3 = resp_to_l3 + +def define_options(parser): + parser.add_option("--num-subcaches", type="int", default=4) + parser.add_option("--l3-data-latency", type="int", 
default=20) + parser.add_option("--l3-tag-latency", type="int", default=15) + parser.add_option("--cpu-to-dir-latency", type="int", default=15) + parser.add_option("--no-resource-stalls", action="store_false", + default=True) + parser.add_option("--num-tbes", type="int", default=256) + parser.add_option("--l2-latency", type="int", default=50) # load to use + +def create_system(options, full_system, system, dma_devices, ruby_system): + if buildEnv['PROTOCOL'] != 'MOESI_AMD_Base': + panic("This script requires the MOESI_AMD_Base protocol.") + + cpu_sequencers = [] + + # + # The ruby network creation expects the list of nodes in the system to + # be consistent with the NetDest list. Therefore the l1 controller + # nodes must be listed before the directory nodes and directory nodes + # before dma nodes, etc. + # + l1_cntrl_nodes = [] + l3_cntrl_nodes = [] + dir_cntrl_nodes = [] + + control_count = 0 + + # + # Must create the individual controllers before the network to ensure + # the controller constructors are called before the network constructor + # + + # This is the base crossbar that connects the L3s, Dirs, and cpu + # Cluster + mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s + for i in xrange(options.num_dirs): + + dir_cntrl = DirCntrl(TCC_select_num_bits = 0) + dir_cntrl.create(options, ruby_system, system) + + # Connect the Directory controller to the ruby network + dir_cntrl.requestFromCores = MessageBuffer(ordered = True) + dir_cntrl.requestFromCores.slave = ruby_system.network.master + + dir_cntrl.responseFromCores = MessageBuffer() + dir_cntrl.responseFromCores.slave = ruby_system.network.master + + dir_cntrl.unblockFromCores = MessageBuffer() + dir_cntrl.unblockFromCores.slave = ruby_system.network.master + + dir_cntrl.probeToCore = MessageBuffer() + dir_cntrl.probeToCore.master = ruby_system.network.slave + + dir_cntrl.responseToCore = MessageBuffer() + dir_cntrl.responseToCore.master = ruby_system.network.slave + + dir_cntrl.triggerQueue = 
MessageBuffer(ordered = True) + dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True) + dir_cntrl.responseFromMemory = MessageBuffer() + + exec("system.dir_cntrl%d = dir_cntrl" % i) + dir_cntrl_nodes.append(dir_cntrl) + + mainCluster.add(dir_cntrl) + + # Technically this config can support an odd number of cpus, but the top + # level config files, such as the ruby_random_tester, will get confused if + # the number of cpus does not equal the number of sequencers. Thus make + # sure that an even number of cpus is specified. + assert((options.num_cpus % 2) == 0) + + # For an odd number of CPUs, still create the right number of controllers + cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s + for i in xrange((options.num_cpus + 1) / 2): + + cp_cntrl = CPCntrl() + cp_cntrl.create(options, ruby_system, system) + + exec("system.cp_cntrl%d = cp_cntrl" % i) + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1]) + + # Connect the CP controllers and the network + cp_cntrl.requestFromCore = MessageBuffer() + cp_cntrl.requestFromCore.master = ruby_system.network.slave + + cp_cntrl.responseFromCore = MessageBuffer() + cp_cntrl.responseFromCore.master = ruby_system.network.slave + + cp_cntrl.unblockFromCore = MessageBuffer() + cp_cntrl.unblockFromCore.master = ruby_system.network.slave + + cp_cntrl.probeToCore = MessageBuffer() + cp_cntrl.probeToCore.slave = ruby_system.network.master + + cp_cntrl.responseToCore = MessageBuffer() + cp_cntrl.responseToCore.slave = ruby_system.network.master + + cp_cntrl.mandatoryQueue = MessageBuffer() + cp_cntrl.triggerQueue = MessageBuffer(ordered = True) + + cpuCluster.add(cp_cntrl) + + # Assuming no DMA devices + assert(len(dma_devices) == 0) + + # Add cpu/gpu clusters to main cluster + mainCluster.add(cpuCluster) + + ruby_system.network.number_of_virtual_networks = 10 + + return (cpu_sequencers, dir_cntrl_nodes, mainCluster) diff --git a/src/SConscript 
b/src/SConscript index 322212cb7..2bac0bff3 100755 --- a/src/SConscript +++ b/src/SConscript @@ -78,7 +78,7 @@ class SourceMeta(type): def __init__(cls, name, bases, dict): super(SourceMeta, cls).__init__(name, bases, dict) cls.all = [] - + def get(cls, **guards): '''Find all files that match the specified guards. If a source file does not specify a flag, the default is False''' @@ -367,9 +367,9 @@ def makeTheISA(source, target, env): target_isa = env['TARGET_ISA'] def define(isa): return isa.upper() + '_ISA' - + def namespace(isa): - return isa[0].upper() + isa[1:].lower() + 'ISA' + return isa[0].upper() + isa[1:].lower() + 'ISA' code = code_formatter() @@ -407,6 +407,51 @@ def makeTheISA(source, target, env): env.Command('config/the_isa.hh', map(Value, all_isa_list), MakeAction(makeTheISA, Transform("CFG ISA", 0))) +def makeTheGPUISA(source, target, env): + isas = [ src.get_contents() for src in source ] + target_gpu_isa = env['TARGET_GPU_ISA'] + def define(isa): + return isa.upper() + '_ISA' + + def namespace(isa): + return isa[0].upper() + isa[1:].lower() + 'ISA' + + + code = code_formatter() + code('''\ +#ifndef __CONFIG_THE_GPU_ISA_HH__ +#define __CONFIG_THE_GPU_ISA_HH__ + +''') + + # create defines for the preprocessing and compile-time determination + for i,isa in enumerate(isas): + code('#define $0 $1', define(isa), i + 1) + code() + + # create an enum for any run-time determination of the ISA, we + # reuse the same name as the namespaces + code('enum class GPUArch {') + for i,isa in enumerate(isas): + if i + 1 == len(isas): + code(' $0 = $1', namespace(isa), define(isa)) + else: + code(' $0 = $1,', namespace(isa), define(isa)) + code('};') + + code(''' + +#define THE_GPU_ISA ${{define(target_gpu_isa)}} +#define TheGpuISA ${{namespace(target_gpu_isa)}} +#define THE_GPU_ISA_STR "${{target_gpu_isa}}" + +#endif // __CONFIG_THE_GPU_ISA_HH__''') + + code.write(str(target[0])) + +env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list), + 
MakeAction(makeTheGPUISA, Transform("CFG ISA", 0))) + ######################################################################## # # Prevent any SimObjects from being added after this point, they @@ -784,7 +829,7 @@ extern "C" { EmbeddedSwig embed_swig_${module}(init_${module}); ''') code.write(str(target[0])) - + # Build all swig modules for swig in SwigSource.all: env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode, @@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = { x = array.array('B', data[i:i+step]) code(''.join('%d,' % d for d in x)) code.dedent() - + code('''}; EmbeddedPython embedded_${sym}( diff --git a/src/arch/SConscript b/src/arch/SConscript index e0d6845f5..b022cb01f 100644 --- a/src/arch/SConscript +++ b/src/arch/SConscript @@ -68,6 +68,14 @@ isa_switch_hdrs = Split(''' # Set up this directory to support switching headers make_switching_dir('arch', isa_switch_hdrs, env) +if env['BUILD_GPU']: + gpu_isa_switch_hdrs = Split(''' + gpu_decoder.hh + gpu_types.hh + ''') + + make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env) + ################################################################# # # Include architecture-specific files. diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h new file mode 100644 index 000000000..b260157ab --- /dev/null +++ b/src/arch/hsail/Brig.h @@ -0,0 +1,67 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013, Advanced Micro Devices, Inc. +// All rights reserved. 
+// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. +#ifndef INTERNAL_BRIG_H +#define INTERNAL_BRIG_H + +#include <stdint.h> + +namespace Brig { +#include "Brig_new.hpp" + +// These typedefs provide some backward compatibility with earlier versions +// of Brig.h, reducing the number of code changes. The distinct names also +// increase legibility by showing the code's intent. 
+typedef BrigBase BrigDirective; +typedef BrigBase BrigOperand; + +enum BrigMemoryFenceSegments { // for internal use only + //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc } + //.mnemo_token=_EMMemoryFenceSegments + //.mnemo_context=EInstModifierInstFenceContext + BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0, + BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1, + BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2, + BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip +}; + +} + +#endif // defined(INTERNAL_BRIG_H) diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp new file mode 100644 index 000000000..60e6f4dea --- /dev/null +++ b/src/arch/hsail/Brig_new.hpp @@ -0,0 +1,1587 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. +// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. + +//.ignore{ + +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include <stdint.h> // fixed-width uintN_t types for the Brig*_t typedefs below + +enum BrigAuxDefs { + MAX_OPERANDS_NUM = 6 +}; + +//} + +typedef uint32_t BrigVersion32_t; + +enum BrigVersion { + + //.nowrap + //.nodump + //.nollvm + + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE + +typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE + +typedef uint8_t BrigAluModifier8_t; + +typedef uint8_t BrigAtomicOperation8_t; + +typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef + +typedef uint8_t BrigCompareOperation8_t; + +typedef uint16_t BrigControlDirective16_t; + +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0 + +typedef uint8_t BrigExecutableModifier8_t; + +typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN + +typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN + +typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN + +typedef uint8_t BrigImageQuery8_t; + +typedef uint16_t BrigKind16_t; + +typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE + +typedef uint8_t 
BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE + +typedef uint8_t BrigMemoryModifier8_t; + +typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED + +typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM + +typedef uint16_t BrigOpcode16_t; + +typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef + +typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE + +typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL + +typedef uint16_t BrigRegisterKind16_t; + +typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE + +typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE + +typedef uint8_t BrigSamplerCoordNormalization8_t; + +typedef uint8_t BrigSamplerFilter8_t; + +typedef uint8_t BrigSamplerQuery8_t; + +typedef uint32_t BrigSectionIndex32_t; + +typedef uint8_t BrigSegCvtModifier8_t; + +typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE + +typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef + +typedef uint16_t BrigType16_t; + +typedef uint8_t BrigVariableModifier8_t; + +typedef uint8_t BrigWidth8_t; + +typedef uint32_t BrigExceptions32_t; + +enum BrigKind { + + //.nollvm + // + //.wname={ s/^BRIG_KIND//; MACRO2Name($_) } + //.mnemo=$wname{ $wname } + // + //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" } + //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1" + // + //.isBodyOnly={ "false" } + //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()" + //.isBodyOnly_default="assert(false); return false" + // + //.isToplevelOnly={ "false" } + //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()" + //.isToplevelOnly_default="assert(false); return false" + + BRIG_KIND_NONE = 0x0000, //.skip + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 
0x1000, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip + + BRIG_KIND_INST_BEGIN = 0x2000, //.skip + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, //.skip + + BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST 
= 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d //.skip +}; + +enum BrigAlignment { + + //.mnemo={ s/^BRIG_ALIGNMENT_//; lc } + //.mnemo_proto="const char* align2str(unsigned arg)" + // + //.bytes={ /(\d+)/ ? $1 : undef } + //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1" + // + //.rbytes=$bytes{ $bytes } + //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)" + //.rbytes_default="return BRIG_ALIGNMENT_LAST" + // + //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" } + + BRIG_ALIGNMENT_NONE = 0, //.no_mnemo + BRIG_ALIGNMENT_1 = 1, //.mnemo="" + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + + BRIG_ALIGNMENT_LAST, //.skip + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip +}; + +enum BrigAllocation { + + //.mnemo={ s/^BRIG_ALLOCATION_//;lc } + //.mnemo_token=EAllocKind + + BRIG_ALLOCATION_NONE = 0, //.mnemo="" + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +enum BrigAtomicOperation { + + //.tdcaption="Atomic Operations" + // + //.mnemo={ s/^BRIG_ATOMIC_//;lc } + //.mnemo_token=_EMAtomicOp + //.mnemo_context=EInstModifierInstAtomicContext + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + 
BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +enum BrigCompareOperation { + + //.tdcaption="Comparison Operators" + // + //.mnemo={ s/^BRIG_COMPARE_//;lc } + //.mnemo_token=_EMCompare + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +enum BrigControlDirective { + + //.mnemo={ s/^BRIG_CONTROL_//;lc } + //.mnemo_token=EControl + // + //.print=$mnemo{ $mnemo } + + BRIG_CONTROL_NONE = 0, //.skip + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +enum BrigExecutableModifierMask { + //.nodump + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +enum BrigImageChannelOrder { + + //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc } + //.mnemo_token=EImageOrder + //.mnemo_context=EImageOrderContext + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + 
BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + // used internally + BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip + +}; + +enum BrigImageChannelType { + + //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc } + //.mnemo_token=EImageFormat + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + // used internally + BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo="" + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageGeometry { + + //.tdcaption="Geometry" + // + //.mnemo={ s/^BRIG_GEOMETRY_//;lc } + //.mnemo_token=EImageGeometry + // + //.dim={/_([0-9]+D)(A)?/ ? 
$1+(defined $2?1:0) : undef} + //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo" + //.dim_default="assert(0); return 0" + // + //.depth={/DEPTH$/?"true":"false"} + //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo" + //.depth_default="return false" + + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + // used internally + BRIG_GEOMETRY_UNKNOWN, //.mnemo="" + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageQuery { + + //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6 +}; + +enum BrigLinkage { + + //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc } + + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +enum BrigMachineModel { + + //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc } + //.mnemo_token=ETargetMachine + // + //.print=$mnemo{ $mnemo } + + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, + + BRIG_MACHINE_UNDEF = 2 //.skip +}; + +enum BrigMemoryModifierMask { //.tddef=0 + BRIG_MEMORY_CONST = 1 +}; + +enum BrigMemoryOrder { + + //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc } + //.mnemo_token=_EMMemoryOrder + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_MEMORY_ORDER_NONE = 0, //.mnemo="" + BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq + BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar + + BRIG_MEMORY_ORDER_LAST = 5 //.skip +}; + +enum BrigMemoryScope { + + //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc } 
+ //.mnemo_token=_EMMemoryScope + // + //.print=$mnemo{ $mnemo } + + BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo="" + BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo="" + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave + BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg + BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent + BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system + + BRIG_MEMORY_SCOPE_LAST = 6 //.skip +}; + +enum BrigOpcode { + + //.tdcaption="Instruction Opcodes" + // + //.k={ "BASIC" } + //.pscode=$k{ MACRO2Name("_".$k) } + //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" } + //.opcodeparser_incfile=ParserUtilities + //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic" + // + //.psopnd={undef} + //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" } + //.opndparser_incfile=ParserUtilities + //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands" + // + //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc } + //.mnemo_scanner=Instructions //.mnemo_token=EInstruction + //.mnemo_context=EDefaultContext + // + //.has_memory_order={undef} + //.semsupport=$has_memory_order{ return $has_memory_order && "true" } + // + //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; } + //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true" + // + //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? 
"Basic":$pscode) ."(inst)").")" } + //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)" + //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()" + //.opcodevis_incfile=ItemUtils + // + //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? "true" : undef } + //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false" + // + //.vecOpndIndex={undef} + //.vecOpndIndex_switch //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1" + //.vecOpndIndex_incfile=ParserUtilities + // + //.numdst={undef} + //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1" + // + //.print=$mnemo{ $mnemo } + + BRIG_OPCODE_NOP = 0, //.k=NOP //.hasType=false + BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD + BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD + BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD + BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD + BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD + BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD + BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD + BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD + BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD + BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, 
//.k=SOURCE_TYPE + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE + BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE + BRIG_OPCODE_COMBINE = 40, //.k=SOURCE_TYPE //.vecOpndIndex=1 + BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0 + BRIG_OPCODE_LDA = 42, //.k=ADDR + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE + BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE + BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT + BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT + BRIG_OPCODE_STOF = 68, //.k=SEG_CVT + BRIG_OPCODE_CMP = 69, //.k=CMP + BRIG_OPCODE_CVT = 70, //.k=CVT + BRIG_OPCODE_LD = 71, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_ST = 72, //.k=MEM //.has_memory_order //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC + BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0 + BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL + BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0 + BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0 + BRIG_OPCODE_RDIMAGE = 78, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_LDIMAGE = 79, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_STIMAGE = 80, //.k=IMAGE //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_IMAGEFENCE = 81, //.k=BASIC_NO_TYPE + 
BRIG_OPCODE_QUERYIMAGE = 82, //.k=QUERY_IMAGE + BRIG_OPCODE_QUERYSAMPLER = 83, //.k=QUERY_SAMPLER + BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0 + BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands + BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 //.hasType=false + BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0 + BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE + BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE + BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0 + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE + BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 //.hasType=false + BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE + BRIG_OPCODE_ALLOCA = 104, //.k=MEM + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0 + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0 + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE + 
BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE + BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE + BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0 + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, //.k=SEG + BRIG_OPCODE_WAVEID = 136, + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip + + BRIG_OPCODE_GCNMADU = (1u << 15) | 0, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMADS = (1u << 15) | 1, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2, + BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3, + BRIG_OPCODE_GCNMED3 = (1u << 15) | 4, + BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNBFM = (1u << 15) | 9, + BRIG_OPCODE_GCNLD = (1u << 15) | 10, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNST = (1u << 15) | 11, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12, //.k=ATOMIC + BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13, //.k=ATOMIC //.mnemo=gcn_atomicNoRet + BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14, + BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15, + BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16, //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc + BRIG_OPCODE_GCNMSAD = (1u << 15) | 17, + BRIG_OPCODE_GCNQSAD = (1u << 15) | 18, + BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19, + BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNSADW = (1u << 15) | 21, + BRIG_OPCODE_GCNSADD = (1u << 15) | 22, + BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23, //.k=ADDR 
//.mnemo=gcn_atomic_consume + BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24, //.k=ADDR //.mnemo=gcn_atomic_append + BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25, //.mnemo=gcn_b4xchg + BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26, //.mnemo=gcn_b32xchg + BRIG_OPCODE_GCNMAX = (1u << 15) | 27, + BRIG_OPCODE_GCNMIN = (1u << 15) | 28, + BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30, + + BRIG_OPCODE_AMDRDIMAGELOD = (1u << 15) | 31, //.k=IMAGE //.mnemo=amd_rdimagelod //.vecOpndIndex=0 + BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32, //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0 + BRIG_OPCODE_AMDLDIMAGEMIP = (1u << 15) | 33, //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0 + BRIG_OPCODE_AMDSTIMAGEMIP = (1u << 15) | 34, //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_AMDQUERYIMAGE = (1u << 15) | 35 //.k=QUERY_IMAGE //.mnemo=amd_queryimage +}; + +enum BrigPack { + + //.tdcaption="Packing" + // + //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc } + //.mnemo_token=_EMPacking + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_PACK_NONE = 0, //.mnemo="" + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +enum BrigProfile { + + //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc } + //.mnemo_token=ETargetProfile + // + //.print=$mnemo{ $mnemo } + + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, + + BRIG_PROFILE_UNDEF = 2 //.skip +}; + +enum BrigRegisterKind { + + //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) } + // + //.bits={ } + //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1" + // + //.nollvm + + BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1 + BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32 + BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64 
+ BRIG_REGISTER_KIND_QUAD = 3 //.bits=128 +}; + +enum BrigRound { + + //.mnemo={} + //.mnemo_fn=round2str //.mnemo_token=_EMRound + // + //.sat={/_SAT$/? "true" : "false"} + //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding" + //.sat_default="return false" + // + //.sig={/_SIGNALING_/? "true" : "false"} + //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding" + //.sig_default="return false" + // + //.int={/_INTEGER_/? "true" : "false"} + //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding" + //.int_default="return false" + // + //.flt={/_FLOAT_/? "true" : "false"} + //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding" + //.flt_default="return false" + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ROUND_NONE = 0, //.no_mnemo + BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near + BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari + BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat + BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat + 
BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat +}; + +enum BrigSamplerAddressing { + + //.mnemo={ s/^BRIG_ADDRESSING_//;lc } + //.mnemo_token=ESamplerAddressingMode + + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerCoordNormalization { + + //.mnemo={ s/^BRIG_COORD_//;lc } + //.mnemo_token=ESamplerCoord + // + //.print=$mnemo{ $mnemo } + + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +enum BrigSamplerFilter { + + //.mnemo={ s/^BRIG_FILTER_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerQuery { + + //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc } + //.mnemo_token=_EMSamplerQuery + // + //.print=$mnemo{ $mnemo } + + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +enum BrigSectionIndex { + + //.nollvm + // + //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc } + + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, + + // used internally + BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip +}; + +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull" +}; + +enum BrigSegment { + + //.mnemo={ s/^BRIG_SEGMENT_//;lc} + //.mnemo_token=_EMSegment + //.mnemo_context=EInstModifierContext + // + //.print=$mnemo{ $mnemo ? 
"_$mnemo" : "" } + + BRIG_SEGMENT_NONE = 0, //.mnemo="" + BRIG_SEGMENT_FLAT = 1, //.mnemo="" + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip + + BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region" +}; + +enum BrigPackedTypeBits { + + //.nodump + // + //.nollvm + + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +enum BrigType { + + //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef } + //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0" + //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef } + //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0" + // + //.mnemo={ s/^BRIG_TYPE_//;lc } + //.mnemo_token=_EMType + // + //.array={/ARRAY$/?"true":"false"} + //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type" + //.array_default="return false" + // + //.a2e={/(.*)_ARRAY$/? 
$1 : "BRIG_TYPE_NONE"} + //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type" + //.a2e_default="return BRIG_TYPE_NONE" + // + //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"} + //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type" + //.e2a_default="return BRIG_TYPE_NONE" + // + //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc} + //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type" + //.t2s_default="return NULL" + // + //.dispatch_switch //.dispatch_incfile=TemplateUtilities + //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)" + //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? "v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" } + //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)" + // + //- .tdname=BrigType + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_TYPE_NONE = 0, //.mnemo="" //.print="" + BRIG_TYPE_U8 = 1, //.ctype=uint8_t + BRIG_TYPE_U16 = 2, //.ctype=uint16_t + BRIG_TYPE_U32 = 3, //.ctype=uint32_t + BRIG_TYPE_U64 = 4, //.ctype=uint64_t + BRIG_TYPE_S8 = 5, //.ctype=int8_t + BRIG_TYPE_S16 = 6, //.ctype=int16_t + BRIG_TYPE_S32 = 7, //.ctype=int32_t + BRIG_TYPE_S64 = 8, //.ctype=int64_t + BRIG_TYPE_F16 = 9, //.ctype=f16_t + BRIG_TYPE_F32 = 10, //.ctype=float + BRIG_TYPE_F64 = 11, //.ctype=double + BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1 + BRIG_TYPE_B8 = 13, //.ctype=uint8_t + BRIG_TYPE_B16 = 14, //.ctype=uint16_t + BRIG_TYPE_B32 = 15, //.ctype=uint32_t + BRIG_TYPE_B64 = 16, //.ctype=uint64_t + BRIG_TYPE_B128 = 17, //.ctype=b128_t + BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64 + BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64 + BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64 + BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64 + BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64 + 
BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64 + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, 
//.mnemo="" //.print="" + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" 
//.print="" + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + + // Used internally + BRIG_TYPE_INVALID = (unsigned) -1 //.skip +}; + +enum BrigVariableModifierMask { + + //.nodump + + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +enum BrigWidth { + + //.tddef=1 + // + //.print={ s/^BRIG_WIDTH_//; "_width($_)" } + + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + 
BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, + + BRIG_WIDTH_LAST //.skip +}; + +struct BrigUInt64 { //.isroot //.standalone + uint32_t lo; //.defValue=0 + uint32_t hi; //.defValue=0 + + //+hcode KLASS& operator=(uint64_t rhs); + //+hcode operator uint64_t(); + //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; } + //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); } +}; + +struct BrigAluModifier { //.isroot //.standalone + BrigAluModifier8_t allBits; //.defValue=0 + //^^ bool ftz; //.wtype=BitValRef<0> +}; + +struct BrigBase { //.nowrap + uint16_t byteCount; + BrigKind16_t kind; +}; + +//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE }; +//.alias Directive:Code { //.generic }; +//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND }; + +struct BrigData { + //.nowrap + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigExecutableModifier { //.isroot //.standalone + BrigExecutableModifier8_t allBits; //.defValue=0 + //^^ bool isDefinition; //.wtype=BitValRef<0> +}; + +struct BrigMemoryModifier { //.isroot //.standalone + BrigMemoryModifier8_t allBits; //.defValue=0 + //^^ bool isConst; 
//.wtype=BitValRef<0> +}; + +struct BrigSegCvtModifier { //.isroot //.standalone + BrigSegCvtModifier8_t allBits; //.defValue=0 + //^^ bool isNoNull; //.wtype=BitValRef<0> +}; + +struct BrigVariableModifier { //.isroot //.standalone + BrigVariableModifier8_t allBits; //.defValue=0 + + //^^ bool isDefinition; //.wtype=BitValRef<0> + //^^ bool isConst; //.wtype=BitValRef<1> +}; + +struct BrigDirectiveArgBlockEnd { + BrigBase base; +}; + +struct BrigDirectiveArgBlockStart { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { //.generic + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; //.defValue=0 + uint16_t inArgCount; //.defValue=0 + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier modifier; //.acc=subItem //.wtype=ExecutableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +//.alias DirectiveKernel:DirectiveExecutable { }; +//.alias DirectiveFunction:DirectiveExecutable { }; +//.alias DirectiveSignature:DirectiveExecutable { }; +//.alias DirectiveIndirectFunction:DirectiveExecutable { }; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier modifier; //.acc=subItem //.wtype=VariableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; //.defValue=1 +}; + +struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE + BrigBase 
base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + + //+hcode bool isArray(); + //+implcode inline bool KLASS::isArray() { return isArrayType(type()); } + + //+hcode unsigned elementType(); + //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); } + + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; //.acc=subItem //.wtype=UInt64 + BrigVariableModifier modifier; //.acc=subItem //.wtype=VariableModifier + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; //.defValue=0 +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; //.wtype=ValRef + BrigVersion32_t hsailMinor; //.wtype=ValRef + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; + + //+hcode Operand operand(int index); + //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; } +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; 
//.acc=subItem //.wtype=AluModifier + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem //.wtype=AluModifier + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier modifier; //.acc=subItem //.wtype=MemoryModifier + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier modifier; //.acc=subItem //.wtype=AluModifier + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t imageQuery; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t samplerQuery; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier modifier; //.acc=subItem //.wtype=SegCvtModifier +}; 
+ +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; //.defValue=0 +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; //.wtype=ItemRef + BrigOperandOffset32_t reg; //.wtype=ItemRef + BrigUInt64 offset; //.acc=subItem //.wtype=UInt64 +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Code elements(int index); + //+implcode inline Code KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; //.defValue=0 + uint16_t reserved; //.defValue=0 + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; //.defValue=0 + BrigUInt64 width; //.acc=subItem //.wtype=UInt64 + BrigUInt64 height; //.acc=subItem //.wtype=UInt64 + BrigUInt64 depth; //.acc=subItem //.wtype=UInt64 + BrigUInt64 array; //.acc=subItem //.wtype=UInt64 +}; + 
+struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +//.ignore{ + +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +#define MODULE_IDENTIFICATION_LENGTH (8) + +struct BrigModuleHeader { + char identification[MODULE_IDENTIFICATION_LENGTH]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#endif // defined(INCLUDED_BRIG_H) +//} diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript new file mode 100644 index 000000000..3455823a6 --- /dev/null +++ b/src/arch/hsail/SConscript @@ -0,0 +1,54 @@ +# -*- mode:python -*- + +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +if env['TARGET_GPU_ISA'] == 'hsail': + env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'], + 'gen.py', '$SOURCE $TARGETS') + + Source('generic_types.cc') + Source('gpu_decoder.cc') + Source('insts/branch.cc') + Source('insts/gen_exec.cc') + Source('insts/gpu_static_inst.cc') + Source('insts/main.cc') + Source('insts/pseudo_inst.cc') + Source('insts/mem.cc') + Source('operand.cc') diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts new file mode 100644 index 000000000..641963c82 --- /dev/null +++ b/src/arch/hsail/SConsopts @@ -0,0 +1,40 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Anthony Gutierrez +# + +Import('*') + +all_gpu_isa_list.append('hsail') diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py new file mode 100755 index 000000000..f2996019b --- /dev/null +++ b/src/arch/hsail/gen.py @@ -0,0 +1,806 @@ +#! /usr/bin/python + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +import sys, re + +from m5.util import code_formatter + +if len(sys.argv) != 4: + print "Error: need 3 args (file names)" + sys.exit(0) + +header_code = code_formatter() +decoder_code = code_formatter() +exec_code = code_formatter() + +############### +# +# Generate file prologs (includes etc.) +# +############### + +header_code(''' +#include "arch/hsail/insts/decl.hh" +#include "base/bitfield.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ +''') +header_code.indent() + +decoder_code(''' +#include "arch/hsail/gpu_decoder.hh" +#include "arch/hsail/insts/branch.hh" +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gen_decl.hh" +#include "arch/hsail/insts/mem.hh" +#include "arch/hsail/insts/mem_impl.hh" +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + std::vector Decoder::decodedInsts; + + GPUStaticInst* + Decoder::decode(MachInst machInst) + { + using namespace Brig; + + const BrigInstBase *ib = machInst.brigInstBase; + const BrigObject *obj = machInst.brigObj; + + switch(ib->opcode) { +''') +decoder_code.indent() +decoder_code.indent() + +exec_code(''' +#include "arch/hsail/insts/gen_decl.hh" +#include "base/intmath.hh" + +namespace HsailISA +{ +''') +exec_code.indent() + +############### +# +# Define code templates for class declarations (for header file) +# +############### + +# Basic header template for an 
instruction with no template parameters. +header_template_nodt = ''' +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Basic header template for an instruction with a single DataType +# template parameter. +header_template_1dt = ''' +template +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_template_1dt_noexec = ''' +template +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } +}; + +''' + +# Same as header_template_1dt, except the base class has a second +# template parameter NumSrcOperands to allow a variable number of +# source operands. Note that since this is implemented with an array, +# it only works for instructions where all sources are of the same +# type (like most arithmetics). +header_template_1dt_varsrcs = ''' +template +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Header template for instruction with two DataType template +# parameters, one for the dest and one for the source. This is used +# by compare and convert. 
+header_template_2dt = ''' +template +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + typedef typename DestDataType::CType DestCType; + typedef typename SrcDataType::CType SrcCType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_templates = { + 'ArithInst': header_template_1dt_varsrcs, + 'CmovInst': header_template_1dt, + 'ClassInst': header_template_1dt, + 'ShiftInst': header_template_1dt, + 'ExtractInsertInst': header_template_1dt, + 'CmpInst': header_template_2dt, + 'CvtInst': header_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': header_template_nodt, + 'SpecialInst1Src': header_template_nodt, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for exec functions +# +############### + +# exec function body +exec_template_nodt_nosrc = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val = $expr; + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_nodt_1src = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + typedef Base::SrcCType SrcCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + SrcCType src_val0 = this->src0.get(w, lane); + DestCType dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_varsrcs = ''' +template +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + 
CType dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get(w, lane); + } + + CType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get(w, lane); + } + + dest_val = (CType)($expr); + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_3srcs = ''' +template +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::Src0CType Src0T; + typedef typename Base::Src1CType Src1T; + typedef typename Base::Src2CType Src2T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template get(w, lane); + } + + Src0T src_val0 = this->src0.template get(w, lane); + Src1T src_val1 = this->src1.template get(w, lane); + Src2T src_val2 = this->src2.template get(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_2src_1dest = ''' +template +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::DestCType DestT; + typedef CType Src0T; + typedef typename Base::Src1CType Src1T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestT dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get(w, lane); + } + Src0T src_val0 = this->src0.template get(w, lane); + Src1T src_val1 = this->src1.template get(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_shift = ''' +template +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template 
get(w, lane); + } + + CType src_val0 = this->src0.template get(w, lane); + uint32_t src_val1 = this->src1.template get(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_2dt = ''' +template +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val; + SrcCType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get(w, lane); + } + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_templates = { + 'ArithInst': exec_template_1dt_varsrcs, + 'CmovInst': exec_template_1dt_3srcs, + 'ExtractInsertInst': exec_template_1dt_3srcs, + 'ClassInst': exec_template_1dt_2src_1dest, + 'CmpInst': exec_template_2dt, + 'CvtInst': exec_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': exec_template_nodt_nosrc, + 'SpecialInst1Src': exec_template_nodt_1src, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for the decoder cases +# +############### + +# decode template for nodt-opcode case +decode_nodt_template = ''' + case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);''' + +decode_case_prolog_class_inst = ''' + case BRIG_OPCODE_$brig_opcode_upper: + { + //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]); + BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType; + //switch (baseOp->kind) { + // case BRIG_OPERAND_REG: + // type = ((const BrigOperandReg*)baseOp)->type; + // break; + // case BRIG_OPERAND_IMMED: + // type = ((const BrigOperandImmed*)baseOp)->type; + // break; + // default: + // fatal("CLASS unrecognized kind of operand %d\\n", + // baseOp->kind); + //} + switch (type) {''' + +# common prolog for 1dt- or 2dt-opcode case: switch on data type +decode_case_prolog = ''' + case 
BRIG_OPCODE_$brig_opcode_upper: + { + switch (ib->type) {''' + +# single-level decode case entry (for 1dt opcodes) +decode_case_entry = \ +' case BRIG_TYPE_$type_name: return $constructor(ib, obj);' + +decode_store_prolog = \ +' case BRIG_TYPE_$type_name: {' + +decode_store_case_epilog = ''' + }''' + +decode_store_case_entry = \ +' return $constructor(ib, obj);' + +# common epilog for type switch +decode_case_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized type %d\\n", + ib->type); + } + } + break;''' + +# Additional templates for nested decode on a second type field (for +# compare and convert). These are used in place of the +# decode_case_entry template to create a second-level switch on on the +# second type field inside each case of the first-level type switch. +# Because the name and location of the second type can vary, the Brig +# instruction type must be provided in $brig_type, and the name of the +# second type field must be provided in $type_field. +decode_case2_prolog = ''' + case BRIG_TYPE_$type_name: + switch (((Brig$brig_type*)ib)->$type2_field) {''' + +decode_case2_entry = \ +' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);' + +decode_case2_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n", + ((Brig$brig_type*)ib)->$type2_field); + } + break;''' + +# Figure out how many source operands an expr needs by looking for the +# highest-numbered srcN value referenced. Since sources are numbered +# starting at 0, the return value is N+1. +def num_src_operands(expr): + if expr.find('src2') != -1: + return 3 + elif expr.find('src1') != -1: + return 2 + elif expr.find('src0') != -1: + return 1 + else: + return 0 + +############### +# +# Define final code generation methods +# +# The gen_nodt, and gen_1dt, and gen_2dt methods are the interface for +# generating actual instructions. 
# Generate the class declaration, exec function, and decode switch
# case(s) for one Brig opcode.
#
# Parameters:
#   brig_opcode        - opcode name; used as the C++ class name, and
#                        upper-cased for the BRIG_OPCODE_* case label.
#   types              - list or tuple of type names the instruction is
#                        instantiated for.  When empty/None (or a plain
#                        string) a single untyped decode case is emitted.
#   expr               - C++ expression for the result, written in terms
#                        of src0/src1/src2 and dest.  Required when
#                        base_class is 'ArithInst'.
#   base_class         - key selecting the header/exec templates; any
#                        trailing '<...>' template arguments are stripped
#                        before the lookup.
#   type2_info         - optional (field_name, types) pair; when given, a
#                        nested switch on that second type field is
#                        emitted inside every first-level type case.
#   constructor_prefix - text placed before the class name in each decode
#                        case ('new ' by default, 'decode' at the
#                        load/store call sites).
#   is_store           - selects the store-style decode cases, whose class
#                        name carries a second template argument.
#
# NOTE(review): the templates reference $constructor, $type_name,
# $brig_opcode_upper, etc., yet the *_code() calls below pass no values,
# so the formatter presumably resolves $names from this function's local
# variables.  Locals that look unused here (opcode, dest_is_src_flag,
# src_size, brig_type, ...) may therefore be read by a template --
# confirm before renaming or deleting any of them.
def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
        type2_info=None, constructor_prefix='new ', is_store=False):
    brig_opcode_upper = brig_opcode.upper()
    class_name = brig_opcode
    opcode = class_name.lower()

    if base_class == 'ArithInst':
        # note that expr must be provided with ArithInst so we can
        # derive num_srcs for the template
        assert expr

    if expr:
        # Derive several bits of info from expr.  If expr is not used,
        # this info will be irrelevant.
        num_srcs = num_src_operands(expr)
        # if the RHS expression includes 'dest', then we're doing an RMW
        # on the reg and we need to treat it like a source
        dest_is_src = expr.find('dest') != -1
        dest_is_src_flag = str(dest_is_src).lower() # for C++
        # Rewrite srcN to the name of the local the exec template
        # declares: an array element (src_val[N]) for the ArithInst,
        # CmpInst and CvtInst templates, a scalar (src_valN) otherwise.
        if base_class in ['ShiftInst']:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
        elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
        else:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
        expr = re.sub(r'\bdest\b', r'dest_val', expr)

    # Strip template arguments off of base class before looking up
    # appropriate templates
    base_class_base = re.sub(r'<.*>$', '', base_class)
    header_code(header_templates[base_class_base])

    # SpecialInst* execute() bodies go to the exec output; every other
    # execute() template is written to the header output instead.
    if base_class.startswith('SpecialInst'):
        exec_code(exec_templates[base_class_base])
    elif base_class.startswith('ShiftInst'):
        header_code(exec_template_shift)
    else:
        header_code(exec_templates[base_class_base])

    if not types or isinstance(types, str):
        # Just a single type
        constructor = constructor_prefix + class_name
        decoder_code(decode_nodt_template)
    else:
        # multiple types, need at least one level of decode
        if brig_opcode == 'Class':
            # 'Class' switches on the instruction's sourceType field
            # rather than on ib->type (see the prolog template).
            decoder_code(decode_case_prolog_class_inst)
        else:
            decoder_code(decode_case_prolog)
        if not type2_info:
            if is_store == False:
                # single list of types, to basic one-level decode
                for type_name in types:
                    full_class_name = '%s<%s>' % (class_name, type_name.upper())
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_case_entry)
            else:
                # single list of types, to basic one-level decode
                for type_name in types:
                    decoder_code(decode_store_prolog)
                    # Second template argument is rebuilt from the type
                    # name itself: its first letter plus the first run
                    # of digits in it (e.g. 'U16' gives 'U16').
                    type_size = int(re.findall(r'[0-9]+', type_name)[0])
                    src_size = 32
                    type_type = type_name[0]
                    full_class_name = '%s<%s,%s>' % (class_name, \
                                                     type_name.upper(), \
                                                     '%s%d' % \
                                                     (type_type.upper(), \
                                                     type_size))
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_store_case_entry)
                    decoder_code(decode_store_case_epilog)
        else:
            # need secondary type switch (convert, compare)
            # unpack extra info on second switch
            (type2_field, types2) = type2_info
            brig_type = 'Inst%s' % brig_opcode
            for type_name in types:
                decoder_code(decode_case2_prolog)
                # '%%s' survives the first substitution as '%s', leaving
                # a slot for the second type that is filled per case.
                fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
                for type2_name in types2:
                    full_class_name = fmt % type2_name.upper()
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_case2_entry)

                decoder_code(decode_case2_epilog)

        decoder_code(decode_case_epilog)
+mem_types = all_int_types + arith_float_types +mem_atom_types = all_int_types + ('B32', 'B64') + +##### Arithmetic & logical operations +gen('Add', arith_types, 'src0 + src1') +gen('Sub', arith_types, 'src0 - src1') +gen('Mul', arith_types, 'src0 * src1') +gen('Div', arith_types, 'src0 / src1') +gen('Min', arith_types, 'std::min(src0, src1)') +gen('Max', arith_types, 'std::max(src0, src1)') +gen('Gcnmin', arith_types, 'std::min(src0, src1)') + +gen('CopySign', arith_float_types, + 'src1 < 0 ? -std::abs(src0) : std::abs(src0)') +gen('Sqrt', arith_float_types, 'sqrt(src0)') +gen('Floor', arith_float_types, 'floor(src0)') + +# "fast" sqrt... same as slow for us +gen('Nsqrt', arith_float_types, 'sqrt(src0)') +gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)') +gen('Nrcp', arith_float_types, '1.0/src0') +gen('Fract', arith_float_types, + '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)') + +gen('Ncos', arith_float_types, 'cos(src0)'); +gen('Nsin', arith_float_types, 'sin(src0)'); + +gen('And', bit_types, 'src0 & src1') +gen('Or', bit_types, 'src0 | src1') +gen('Xor', bit_types, 'src0 ^ src1') + +gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)') +gen('Firstbit',bit_types, 'firstbit(src0)') +gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)') + +gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst') +gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst') + +# gen('Mul_hi', types=('s32','u32', '??')) +# gen('Mul24', types=('s32','u32', '??')) +gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)') + +gen('Abs', arith_types, 'std::abs(src0)') +gen('Neg', arith_types, '-src0') + +gen('Mov', bit_types, 'src0') +gen('Not', bit_types, 'heynot(src0)') + +# mad and fma differ only in rounding behavior, which we don't emulate +# also there's an integer form of mad, but not of fma +gen('Mad', arith_types, 'src0 * src1 + src2') +gen('Fma', arith_float_types, 'src0 * src1 + src2') + +#native floating point operations +gen('Nfma', 
# Generate a "special" instruction: one whose result is computed by
# 'expr' from at most one source operand.
#
#   brig_opcode - opcode name, passed through to gen().
#   expr        - C++ expression for the result; the number of srcN
#                 names it mentions selects the base class.
#   dest_type   - type name used as the base class's template argument.
#
# No type list is passed to gen(), so the decoder gets a single,
# untyped case for the opcode.  Raises ValueError when expr refers to
# more than one source operand, since only the no-source and
# one-source special base classes are handled here.
def gen_special(brig_opcode, expr, dest_type='U32'):
    num_srcs = num_src_operands(expr)
    if num_srcs == 0:
        base_class = 'SpecialInstNoSrc<%s>' % dest_type
    elif num_srcs == 1:
        base_class = 'SpecialInst1Src<%s>' % dest_type
    else:
        # The original 'assert false' was itself an error: 'false' is not
        # a Python name, so it raised NameError instead of reporting the
        # real problem.
        raise ValueError(
            "special instruction '%s' uses %d source operands; "
            "at most 1 is supported" % (brig_opcode, num_srcs))

    gen(brig_opcode, None, expr, base_class)
+gen_special('WorkGroupSize', 'w->workgroupsz[src0]') +gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]') +gen_special('GridSize', 'w->gridsz[src0]') +gen_special('GridGroups', + 'divCeil(w->gridsz[src0],w->workgroupsz[src0])') +gen_special('LaneId', 'lane') +gen_special('WaveId', 'w->dynwaveid') +gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64') + +# gen_special('CU'', ') + +gen('Ret', base_class='SpecialInstNoSrcNoDest') +gen('Barrier', base_class='SpecialInstNoSrcNoDest') +gen('MemFence', base_class='SpecialInstNoSrcNoDest') + +# Map magic instructions to the BrigSyscall opcode +# Magic instructions are defined in magic.hh +# +# In the future, real HSA kernel system calls can be implemented and coexist +# with magic instructions. +gen('Call', base_class='SpecialInstNoSrcNoDest') + +############### +# +# Generate file epilogs +# +############### +header_code.dedent() +header_code(''' +} // namespace HsailISA +''') + +# close off main decode switch +decoder_code.dedent() +decoder_code.dedent() +decoder_code(''' + default: fatal("unrecognized Brig opcode %d\\n", ib->opcode); + } // end switch(ib->opcode) + } // end decode() +} // namespace HsailISA +''') + +exec_code.dedent() +exec_code(''' +} // namespace HsailISA +''') + +############### +# +# Output accumulated code to files +# +############### +header_code.write(sys.argv[1]) +decoder_code.write(sys.argv[2]) +exec_code.write(sys.argv[3]) diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc new file mode 100644 index 000000000..0cd55d1d5 --- /dev/null +++ b/src/arch/hsail/generic_types.cc @@ -0,0 +1,47 @@ +#include "arch/hsail/generic_types.hh" +#include "base/misc.hh" + +using namespace Brig; + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(BrigMemoryOrder brig_memory_order) + { + switch(brig_memory_order) { + case BRIG_MEMORY_ORDER_NONE: + return Enums::MEMORY_ORDER_NONE; + case BRIG_MEMORY_ORDER_RELAXED: + return 
Enums::MEMORY_ORDER_RELAXED; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + return Enums::MEMORY_ORDER_SC_ACQUIRE; + case BRIG_MEMORY_ORDER_SC_RELEASE: + return Enums::MEMORY_ORDER_SC_RELEASE; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE; + default: + fatal("HsailISA::MemInst::getGenericMemoryOrder -> ", + "bad BrigMemoryOrder\n"); + } + } + + Enums::GenericMemoryScope + getGenericMemoryScope(BrigMemoryScope brig_memory_scope) + { + switch(brig_memory_scope) { + case BRIG_MEMORY_SCOPE_NONE: + return Enums::MEMORY_SCOPE_NONE; + case BRIG_MEMORY_SCOPE_WORKITEM: + return Enums::MEMORY_SCOPE_WORKITEM; + case BRIG_MEMORY_SCOPE_WORKGROUP: + return Enums::MEMORY_SCOPE_WORKGROUP; + case BRIG_MEMORY_SCOPE_AGENT: + return Enums::MEMORY_SCOPE_DEVICE; + case BRIG_MEMORY_SCOPE_SYSTEM: + return Enums::MEMORY_SCOPE_SYSTEM; + default: + fatal("HsailISA::MemInst::getGenericMemoryScope -> ", + "bad BrigMemoryScope\n"); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh new file mode 100644 index 000000000..50e430bef --- /dev/null +++ b/src/arch/hsail/generic_types.hh @@ -0,0 +1,16 @@ +#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__ +#define __ARCH_HSAIL_GENERIC_TYPES_HH__ + +#include "arch/hsail/Brig.h" +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order); + Enums::GenericMemoryScope + getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__ diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh new file mode 100644 index 000000000..98a689664 --- /dev/null +++ b/src/arch/hsail/gpu_decoder.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_GPU_DECODER_HH__ +#define __ARCH_HSAIL_GPU_DECODER_HH__ + +#include + +#include "arch/hsail/gpu_types.hh" + +class BrigObject; +class GPUStaticInst; + +namespace Brig +{ + class BrigInstBase; +} + +namespace HsailISA +{ + class Decoder + { + public: + GPUStaticInst* decode(MachInst machInst); + + GPUStaticInst* + decode(RawMachInst inst) + { + return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr; + } + + RawMachInst + saveInst(GPUStaticInst *decodedInst) + { + decodedInsts.push_back(decodedInst); + + return decodedInsts.size() - 1; + } + + private: + static std::vector decodedInsts; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GPU_DECODER_HH__ diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh new file mode 100644 index 000000000..4b3a66a9a --- /dev/null +++ b/src/arch/hsail/gpu_types.hh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
namespace HsailISA
{
    // A raw machine instruction represents the raw bits that
    // our model uses to represent an actual instruction. In
    // the case of HSAIL this is just an index into a list of
    // instruction objects.
    typedef uint64_t RawMachInst;

    // The MachInst is a representation of an instruction
    // that has more information than just the machine code.
    // For HSAIL the actual machine code is a BrigInstBase
    // and the BrigObject contains more pertinent
    // information related to operands, etc.

    struct MachInst
    {
        // the BRIG instruction record itself
        const Brig::BrigInstBase *brigInstBase;
        // the BRIG object the instruction belongs to
        const BrigObject *brigObj;
    };
}
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/branch.hh" + +#include "gpu-compute/hsail_code.hh" + +namespace HsailISA +{ + GPUStaticInst* + decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // register operand. 
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrnIndirectInst(ib, obj); + } else { + return new BrnDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new CbrIndirectInst(ib, obj); + } else { + return new CbrDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrIndirectInst(ib, obj); + } else { + return new BrDirectInst(ib, obj); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh new file mode 100644 index 000000000..54ad9a042 --- /dev/null +++ b/src/arch/hsail/insts/branch.hh @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__ +#define __ARCH_HSAIL_INSTS_BRANCH_HH__ + +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ + + // The main difference between a direct branch and an indirect branch + // is whether the target is a register or a label, so we can share a + // lot of code if we template the base implementation on that type. 
+ template + class BrnInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + TargetType target; + + BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "brn") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr*)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + + bool isSrcOperand(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return true; + } + + bool isDstOperand(int operandIndex) { + return false; + } + + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + + int getNumOperands() { + return 1; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template + void + BrnInstBase::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template + void + BrnInstBase::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + 
w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrnDirectInst : public BrnInstBase + { + public: + BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase(ib, obj) + { + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrnIndirectInst : public BrnInstBase + { + public: + BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template + class CbrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + CRegOperand cond; + TargetType target; + + CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "cbr") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr *)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + cond.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + void execute(GPUDynInstPtr gpuDynInst); + // Assumption: Target is operand 0, Condition Register is operand 1 + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isVectorRegister(); + else + return false; + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isCondRegister(); + else + return true; + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + 
return target.isScalarRegister(); + else + return false; + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == 0) + return true; + return false; + } + // both Condition Register and Target are source operands + bool isDstOperand(int operandIndex) { + return false; + } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.opSize(); + else + return 1; + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.regIndex(); + else + return -1; + } + + // Operands = Target, Condition Register + int getNumOperands() { + return 2; + } + }; + + template + void + CbrInstBase::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s,%s", opcode, widthClause, + cond.disassemble(), target.disassemble()); + } + + template + void + CbrInstBase::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const uint32_t curr_pc = w->pc(); + const uint32_t curr_rpc = w->rpc(); + const VectorMask curr_mask = w->execMask(); + + /** + * TODO: can we move this pop outside the instruction, and + * into the wavefront? 
+ */ + w->popFromReconvergenceStack(); + + // immediate post-dominator instruction + const uint32_t rpc = static_cast(ipdInstNum()); + if (curr_rpc != rpc) { + w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask); + } + + // taken branch + const uint32_t true_pc = getTargetPc(); + VectorMask true_mask; + for (unsigned int lane = 0; lane < VSZ; ++lane) { + true_mask[lane] = cond.get(w, lane) & curr_mask[lane]; + } + + // not taken branch + const uint32_t false_pc = curr_pc + 1; + assert(true_pc != false_pc); + if (false_pc != rpc && true_mask.count() < curr_mask.count()) { + VectorMask false_mask = curr_mask & ~true_mask; + w->pushToReconvergenceStack(false_pc, rpc, false_mask); + } + + if (true_pc != rpc && true_mask.count()) { + w->pushToReconvergenceStack(true_pc, rpc, true_mask); + } + assert(w->pc() != curr_pc); + w->discardFetch(); + } + + + class CbrDirectInst : public CbrInstBase + { + public: + CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase(ib, obj) + { + } + // the source operand of a conditional branch is a Condition + // Register which is not stored in the VRF + // so we do not count it as a source-register operand + // even though, formally, it is one. + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class CbrIndirectInst : public CbrInstBase + { + public: + CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase(ib, obj) + { + } + // one source operand of the conditional indirect branch is a Condition + // register which is not stored in the VRF so we do not count it + // as a source-register operand even though, formally, it is one. 
+ int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template + class BrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + ImmOperand width; + TargetType target; + + BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "br") + { + o_type = Enums::OT_BRANCH; + width.init(((Brig::BrigInstBr *)ib)->width, obj); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + + void execute(GPUDynInstPtr gpuDynInst); + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + int getNumOperands() { return 1; } + }; + + template + void + BrInstBase::generateDisassembly() + { + std::string widthClause; + + if (width.bits != 1) { + widthClause = csprintf("_width(%d)", width.bits); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + 
target.disassemble()); + } + + template + void + BrInstBase::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrDirectInst : public BrInstBase + { + public: + BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase(ib, obj) + { + } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrIndirectInst : public BrInstBase + { + public: + BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib, + const BrigObject *obj); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__ diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh new file mode 100644 index 000000000..e2da501b9 --- /dev/null +++ b/src/arch/hsail/insts/decl.hh @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_DECL_HH__ +#define __ARCH_HSAIL_INSTS_DECL_HH__ + +#include + +#include "arch/hsail/generic_types.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "debug/HSAIL.hh" +#include "enums/OpType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +namespace HsailISA +{ + template + class HsailOperandType + { + public: + typedef _DestOperand DestOperand; + typedef _SrcOperand SrcOperand; + }; + + typedef HsailOperandType CRegOperandType; + typedef HsailOperandType SRegOperandType; + typedef HsailOperandType DRegOperandType; + + // The IsBits parameter serves only to disambiguate the B* types from + // the U* types, which otherwise would be identical (and + // indistinguishable).
+ template + class HsailDataType + { + public: + typedef _OperandType OperandType; + typedef _CType CType; + static const Enums::MemType memType = _memType; + static const vgpr_type vgprType = _vgprType; + static const char *label; + }; + + typedef HsailDataType B1; + typedef HsailDataType B8; + + typedef HsailDataType B16; + + typedef HsailDataType B32; + + typedef HsailDataType B64; + + typedef HsailDataType S8; + typedef HsailDataType S16; + typedef HsailDataType S32; + typedef HsailDataType S64; + + typedef HsailDataType U8; + typedef HsailDataType U16; + typedef HsailDataType U32; + typedef HsailDataType U64; + + typedef HsailDataType F32; + typedef HsailDataType F64; + + template + class CommonInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename SrcOperandType::SrcOperand src[NumSrcOperands]; + + void + generateDisassembly() + { + disassembly = csprintf("%s%s %s", opcode, opcode_suffix(), + dest.disassemble()); + + for (int i = 0; i < NumSrcOperands; ++i) { + disassembly += ","; + disassembly += src[i].disassemble(); + } + } + + virtual std::string opcode_suffix() = 0; + + public: + CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + + dest.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else + return dest.isCondRegister(); + } + 
bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + return false; + } + + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= NumSrcOperands) + return true; + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + + if (operandIndex < NumSrcOperands) + return src[operandIndex].regIndex(); + else + return dest.regIndex(); + } + int numSrcRegOperands() { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return NumSrcOperands + 1; } + }; + + template + class ArithInst : public CommonInstBase + { + public: + std::string opcode_suffix() { return csprintf("_%s", DataType::label); } + + ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : CommonInstBase(ib, obj, opcode) + { + } + }; + + template + class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + typename Src2OperandType::SrcOperand src2; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s,%s", opcode, 
dest.disassemble(), + src0.disassemble(), src1.disassemble(), + src2.disassemble()); + } + + public: + ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 3); + src2.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else if (operandIndex == 2) + return src2.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else if (operandIndex == 2) + return src2.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else if (operandIndex == 2) + return src2.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 3) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 3) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex 
>= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else if (operandIndex == 2) + return src2.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else if (operandIndex == 2) + return src2.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + if (src2.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 4; } + }; + + template + class ThreeNonUniformSourceInst : + public ThreeNonUniformSourceInstBase + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + typedef typename Src2DataType::CType Src2CType; + + ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : ThreeNonUniformSourceInstBase(ib, + obj, opcode) + { + } + }; + + template + class CmovInst : public ThreeNonUniformSourceInst + { + public: + CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst(ib, obj, opcode) + { + } + }; + + template + class ExtractInsertInst : public ThreeNonUniformSourceInst + { + public: + ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst(ib, obj, opcode) + { + } + }; + + template + class TwoNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename 
Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble()); + } + + + public: + TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 2) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 2) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return 
src1.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 3; } + }; + + template + class TwoNonUniformSourceInst : + public TwoNonUniformSourceInstBase + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + + TwoNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : TwoNonUniformSourceInstBase(ib, + obj, opcode) + { + } + }; + + // helper function for ClassInst: returns true when src0 belongs to a class selected by the bit mask src1; the sign is read with std::signbit() because "src0 <= -0.0" is also true for +0.0 + template + bool + fpclassify(T src0, uint32_t src1) + { + int fpclass = std::fpclassify(src0); + + if ((src1 & 0x3) && (fpclass == FP_NAN)) { + return true; + } + + if (std::signbit(src0)) { + if ((src1 & 0x4) && fpclass == FP_INFINITE) + return true; + if ((src1 & 0x8) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x10) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x20) && fpclass == FP_ZERO) + return true; + } else { + if ((src1 & 0x40) && fpclass == FP_ZERO) + return true; + if ((src1 & 0x80) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x100) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x200) && fpclass == FP_INFINITE) + return true; + } + return false; + } + + template + class ClassInst : public TwoNonUniformSourceInst + { + public: + ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst(ib, obj, opcode) + { + } + }; + + template + class 
ShiftInst : public TwoNonUniformSourceInst + { + public: + ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst(ib, obj, opcode) + { + } + }; + + // helper function for CmpInst: evaluates cmpOp on (src0, src1). NUM is true only when both operands are non-NaN; NAN is true when either is NaN. NOTE(review): the unordered (*U) variants are evaluated exactly like their ordered counterparts - confirm NaN handling against the HSAIL spec + template + bool + compare(T src0, T src1, Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + case BRIG_COMPARE_EQU: + case BRIG_COMPARE_SEQ: + case BRIG_COMPARE_SEQU: + return (src0 == src1); + + case BRIG_COMPARE_NE: + case BRIG_COMPARE_NEU: + case BRIG_COMPARE_SNE: + case BRIG_COMPARE_SNEU: + return (src0 != src1); + + case BRIG_COMPARE_LT: + case BRIG_COMPARE_LTU: + case BRIG_COMPARE_SLT: + case BRIG_COMPARE_SLTU: + return (src0 < src1); + + case BRIG_COMPARE_LE: + case BRIG_COMPARE_LEU: + case BRIG_COMPARE_SLE: + case BRIG_COMPARE_SLEU: + return (src0 <= src1); + + case BRIG_COMPARE_GT: + case BRIG_COMPARE_GTU: + case BRIG_COMPARE_SGT: + case BRIG_COMPARE_SGTU: + return (src0 > src1); + + case BRIG_COMPARE_GE: + case BRIG_COMPARE_GEU: + case BRIG_COMPARE_SGE: + case BRIG_COMPARE_SGEU: + return (src0 >= src1); + + case BRIG_COMPARE_NUM: + case BRIG_COMPARE_SNUM: + return (src0 == src0) && (src1 == src1); + + case BRIG_COMPARE_NAN: + case BRIG_COMPARE_SNAN: + return (src0 != src0) || (src1 != src1); + + default: + fatal("Bad cmpOp value %d\n", (int)cmpOp); + } + } + + template + int32_t + firstbit(T src0) + { + if (!src0) + return -1; + + //handle positive and negative numbers + T tmp = (src0 < 0) ? 
(~src0) : (src0); + + //the starting pos is MSB + int pos = 8 * sizeof(T) - 1; + int cnt = 0; + + //search the first bit set to 1; the shift is done on a 64-bit unsigned copy so it is defined for every pos, and the scan stops at pos < 0 (tmp == 0, i.e. src0 == -1) + while (pos >= 0 && !(((uint64_t)tmp >> pos) & 1)) { + ++cnt; + --pos; + } + return (pos < 0) ? -1 : cnt; + } + + const char* cmpOpToString(Brig::BrigCompareOperation cmpOp); + + template + class CmpInstBase : public CommonInstBase + { + protected: + Brig::BrigCompareOperation cmpOp; + + public: + CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase(ib, obj, + _opcode) + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP); + Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib; + cmpOp = (Brig::BrigCompareOperation)i->compare; + } + }; + + template + class CmpInst : public CmpInstBase + { + public: + std::string + opcode_suffix() + { + return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp), + DestDataType::label, SrcDataType::label); + } + + CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CmpInstBase(ib, obj, _opcode) + { + } + }; + + template + class CvtInst : public CommonInstBase + { + public: + std::string opcode_suffix() + { + return csprintf("_%s_%s", DestDataType::label, SrcDataType::label); + } + + CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase(ib, obj, _opcode) + { + } + }; + + class SpecialInstNoSrcNoDest : public HsailGPUStaticInst + { + public: + SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + int numSrcRegOperands() { return 0; 
} + int numDstRegOperands() { return 0; } + int getNumOperands() { return 0; } + }; + + template + class SpecialInstNoSrcBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + + void generateDisassembly() + { + disassembly = csprintf("%s %s", opcode, dest.disassemble()); + } + + public: + SpecialInstNoSrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template + class SpecialInstNoSrc : + public SpecialInstNoSrcBase + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInstNoSrcBase(ib, obj, + _opcode) + { + } + }; + + template + class SpecialInst1SrcBase : public HsailGPUStaticInst + { + protected: + typedef int SrcCType; // used in execute() template + + typename 
DestOperandType::DestOperand dest; + ImmOperand src0; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(), + src0.disassemble()); + } + + public: + SpecialInst1SrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template + class SpecialInst1Src : + public SpecialInst1SrcBase + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInst1SrcBase(ib, obj, + _opcode) + { + } + }; + + class Ret : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "ret") + { + o_type = Enums::OT_RET; + } + + void 
execute(GPUDynInstPtr gpuDynInst); + }; + + class Barrier : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + uint8_t width; + + Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "barrier") + { + o_type = Enums::OT_BARRIER; + assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); + width = (uint8_t)((Brig::BrigInstBr*)ib)->width; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class MemFence : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Brig::BrigMemoryOrder memFenceMemOrder; + Brig::BrigMemoryScope memFenceScopeSegGroup; + Brig::BrigMemoryScope memFenceScopeSegGlobal; + Brig::BrigMemoryScope memFenceScopeSegImage; + + MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "memfence") + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE); + + memFenceScopeSegGlobal = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope; + + memFenceScopeSegGroup = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope; + + memFenceScopeSegImage = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope; + + memFenceMemOrder = (Brig::BrigMemoryOrder) + ((Brig::BrigInstMemFence*)ib)->memoryOrder; + + // set o_type based on scopes + if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && + memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_BOTH_MEMFENCE; + } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_GLOBAL_MEMFENCE; + } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_SHARED_MEMFENCE; + } else { + fatal("MemFence constructor: bad scope specifiers\n"); + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + Wavefront *wave = gpuDynInst->wavefront(); + wave->computeUnit->injectGlobalMemFence(gpuDynInst); + } + + void + execute(GPUDynInstPtr 
gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + // 2 cases: + // * memfence to a sequentially consistent memory (e.g., LDS). + // These can be handled as no-ops. + // * memfence to a relaxed consistency cache (e.g., Hermes, Viper, + // etc.). We send a packet, tagged with the memory order and + // scope, and let the GPU coalescer handle it. + + if (o_type == Enums::OT_GLOBAL_MEMFENCE || + o_type == Enums::OT_BOTH_MEMFENCE) { + gpuDynInst->simdId = w->simdId; + gpuDynInst->wfSlotId = w->wfSlotId; + gpuDynInst->wfDynId = w->wfDynId; + gpuDynInst->kern_id = w->kern_id; + gpuDynInst->cu_id = w->computeUnit->cu_id; + + gpuDynInst->memoryOrder = + getGenericMemoryOrder(memFenceMemOrder); + gpuDynInst->scope = + getGenericMemoryScope(memFenceScopeSegGlobal); + gpuDynInst->useContinuation = false; + GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); + gmp->getGMReqFIFO().push(gpuDynInst); + + w->wr_gm_reqs_in_pipe--; + w->rd_gm_reqs_in_pipe--; + w->mem_reqs_in_pipe--; + w->outstanding_reqs++; + } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + // no-op + } else { + fatal("MemFence execute: bad o_type\n"); + } + } + }; + + class Call : public HsailGPUStaticInst + { + public: + // private helper functions + void calcAddr(Wavefront* w, GPUDynInstPtr m); + + void + generateDisassembly() + { + if (dest.disassemble() == "") { + disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(), + src1.disassemble()); + } else { + disassembly = csprintf("%s %s (%s) (%s)", opcode, + src0.disassemble(), dest.disassemble(), + src1.disassemble()); + } + } + + bool + isPseudoOp() + { + std::string func_name = src0.disassemble(); + if (func_name.find("__gem5_hsail_op") != std::string::npos) { + return true; + } + return false; + } + + // member variables + ListOperand dest; + FunctionRefOperand src0; + ListOperand src1; + HsailCode *func_ptr; + + // exec function for pseudo instructions mapped on top of call opcode + void execPseudoInst(Wavefront *w, GPUDynInstPtr 
gpuDynInst); + + // user-defined pseudo instructions + void MagicPrintLane(Wavefront *w); + void MagicPrintLane64(Wavefront *w); + void MagicPrintWF32(Wavefront *w); + void MagicPrintWF64(Wavefront *w); + void MagicPrintWFFloat(Wavefront *w); + void MagicSimBreak(Wavefront *w); + void MagicPrefixSum(Wavefront *w); + void MagicReduction(Wavefront *w); + void MagicMaskLower(Wavefront *w); + void MagicMaskUpper(Wavefront *w); + void MagicJoinWFBar(Wavefront *w); + void MagicWaitWFBar(Wavefront *w); + void MagicPanic(Wavefront *w); + + void MagicAtomicNRAddGlobalU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicAtomicNRAddGroupU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst); + + void MagicXactCasLd(Wavefront *w); + void MagicMostSigThread(Wavefront *w); + void MagicMostSigBroadcast(Wavefront *w); + + void MagicPrintWF32ID(Wavefront *w); + void MagicPrintWFID64(Wavefront *w); + + Call(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "call") + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + func_ptr = nullptr; + std::string func_name = src0.disassemble(); + if (!isPseudoOp()) { + func_ptr = dynamic_cast(obj-> + getFunction(func_name)); + + if (!func_ptr) + fatal("call::exec cannot find function: %s\n", func_name); + } + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + void + execute(GPUDynInstPtr gpuDynInst) + 
{ + Wavefront *w = gpuDynInst->wavefront(); + + std::string func_name = src0.disassemble(); + if (isPseudoOp()) { + execPseudoInst(w, gpuDynInst); + } else { + fatal("Native HSAIL functions are not yet implemented: %s\n", + func_name); + } + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 2; } + }; + + template T heynot(T arg) { return ~arg; } + template<> inline bool heynot(bool arg) { return !arg; } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_DECL_HH__ diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc new file mode 100644 index 000000000..bbaeb13e6 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/gpu_static_inst.hh" + +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj, + const std::string &opcode) + : GPUStaticInst(opcode), hsailCode(obj->currentCode) + { + } + + void + HsailGPUStaticInst::generateDisassembly() + { + disassembly = opcode; + } + + const std::string& + HsailGPUStaticInst::disassemble() + { + if (disassembly.empty()) { + generateDisassembly(); + assert(!disassembly.empty()); + } + + return disassembly; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh new file mode 100644 index 000000000..29aab1f70 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ +#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing HSAIL GPU static instructions. 
+ */ + +#include "gpu-compute/gpu_static_inst.hh" + +class BrigObject; +class HsailCode; + +namespace HsailISA +{ + class HsailGPUStaticInst : public GPUStaticInst + { + public: + HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode); + void generateDisassembly(); + const std::string &disassemble(); + uint32_t instSize() { return 4; } + + protected: + HsailCode *hsailCode; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc new file mode 100644 index 000000000..4e70bf46a --- /dev/null +++ b/src/arch/hsail/insts/main.cc @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/decl.hh" +#include "debug/GPUExec.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/simple_pool_manager.hh" + +namespace HsailISA +{ + template<> const char *B1::label = "b1"; + template<> const char *B8::label = "b8"; + template<> const char *B16::label = "b16"; + template<> const char *B32::label = "b32"; + template<> const char *B64::label = "b64"; + + template<> const char *S8::label = "s8"; + template<> const char *S16::label = "s16"; + template<> const char *S32::label = "s32"; + template<> const char *S64::label = "s64"; + + template<> const char *U8::label = "u8"; + template<> const char *U16::label = "u16"; + template<> const char *U32::label = "u32"; + template<> const char *U64::label = "u64"; + + template<> const char *F32::label = "f32"; + template<> const char *F64::label = "f64"; + + const char* + cmpOpToString(Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + return "eq"; + case BRIG_COMPARE_NE: + return "ne"; + case BRIG_COMPARE_LT: + return "lt"; + case BRIG_COMPARE_LE: + return "le"; + case BRIG_COMPARE_GT: + return "gt"; + case BRIG_COMPARE_GE: + return "ge"; + case BRIG_COMPARE_EQU: + return "equ"; + case BRIG_COMPARE_NEU: + return "neu"; + case BRIG_COMPARE_LTU: + return "ltu"; + case BRIG_COMPARE_LEU: + return "leu"; + case BRIG_COMPARE_GTU: + return "gtu"; + case 
BRIG_COMPARE_GEU: + return "geu"; + case BRIG_COMPARE_NUM: + return "num"; + case BRIG_COMPARE_NAN: + return "nan"; + case BRIG_COMPARE_SEQ: + return "seq"; + case BRIG_COMPARE_SNE: + return "sne"; + case BRIG_COMPARE_SLT: + return "slt"; + case BRIG_COMPARE_SLE: + return "sle"; + case BRIG_COMPARE_SGT: + return "sgt"; + case BRIG_COMPARE_SGE: + return "sge"; + case BRIG_COMPARE_SGEU: + return "sgeu"; + case BRIG_COMPARE_SEQU: + return "sequ"; + case BRIG_COMPARE_SNEU: + return "sneu"; + case BRIG_COMPARE_SLTU: + return "sltu"; + case BRIG_COMPARE_SLEU: + return "sleu"; + case BRIG_COMPARE_SNUM: + return "snum"; + case BRIG_COMPARE_SNAN: + return "snan"; + case BRIG_COMPARE_SGTU: + return "sgtu"; + default: + return "unknown"; + } + } + + void + Ret::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + // mask off completed work-items + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->init_mask[lane] = 0; + } + + } + + // delete extra instructions fetched for completed work-items + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) { + w->dropFetch = true; + } + + // if all work-items have completed, then wave-front is done + if (w->init_mask.none()) { + w->status = Wavefront::S_STOPPED; + + int32_t refCount = w->computeUnit->getLds(). 
+ decreaseRefCounter(w->dispatchid, w->wg_id); + + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", + w->computeUnit->cu_id, w->wg_id, refCount); + + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= + w->reservedVectorRegs; + + assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0); + + uint32_t endIndex = (w->startVgprIndex + + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->vrf[w->simdId]->manager-> + freeRegion(w->startVgprIndex, endIndex); + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + w->computeUnit->completedWfs++; + + DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); + + if (!refCount) { + // Notify Memory System of Kernel Completion + // Kernel End = isKernel + isRelease + w->status = Wavefront::S_RETURNING; + GPUDynInstPtr local_mempacket = gpuDynInst; + local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; + local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; + local_mempacket->useContinuation = false; + local_mempacket->simdId = w->simdId; + local_mempacket->wfSlotId = w->wfSlotId; + local_mempacket->wfDynId = w->wfDynId; + w->computeUnit->injectGlobalMemFence(local_mempacket, true); + } else { + w->computeUnit->shader->dispatcher->scheduleDispatch(); + } + } + } + + void + Barrier::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + assert(w->barrier_cnt == w->old_barrier_cnt); + w->barrier_cnt = w->old_barrier_cnt + 1; + w->stalledAtBarrier = true; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc new file mode 100644 index 000000000..97d4c902b --- /dev/null +++ b/src/arch/hsail/insts/mem.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/mem.hh" + +#include "arch/hsail/Brig.h" +#include "enums/OpType.hh" + +using namespace Brig; + +namespace HsailISA +{ + const char* atomicOpToString(BrigAtomicOperation brigOp); + + Enums::MemOpType + brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) + { + if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_AAND; + case BRIG_ATOMIC_OR: + return Enums::MO_AOR; + case BRIG_ATOMIC_XOR: + return Enums::MO_AXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ACAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_AEXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_AADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_AINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ADEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_AMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_AMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ASUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_ANRAND; + case BRIG_ATOMIC_OR: + return Enums::MO_ANROR; + case BRIG_ATOMIC_XOR: + return Enums::MO_ANRXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ANRCAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_ANREXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_ANRADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_ANRINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ANRDEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_ANRMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_ANRMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ANRSUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else { + fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); + } + } + + const char* + atomicOpToString(BrigAtomicOperation brigOp) + { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return "and"; + case BRIG_ATOMIC_OR: + return "or"; + case 
BRIG_ATOMIC_XOR: + return "xor"; + case BRIG_ATOMIC_CAS: + return "cas"; + case BRIG_ATOMIC_EXCH: + return "exch"; + case BRIG_ATOMIC_ADD: + return "add"; + case BRIG_ATOMIC_WRAPINC: + return "inc"; + case BRIG_ATOMIC_WRAPDEC: + return "dec"; + case BRIG_ATOMIC_MIN: + return "min"; + case BRIG_ATOMIC_MAX: + return "max"; + case BRIG_ATOMIC_SUB: + return "sub"; + default: + return "unknown"; + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh new file mode 100644 index 000000000..d3ce76dee --- /dev/null +++ b/src/arch/hsail/insts/mem.hh @@ -0,0 +1,1629 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ +#define __ARCH_HSAIL_INSTS_MEM_HH__ + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" + +namespace HsailISA +{ + class MemInst + { + public: + MemInst() : size(0), addr_operand(nullptr) { } + + MemInst(Enums::MemType m_type) + { + if (m_type == Enums::M_U64 || + m_type == Enums::M_S64 || + m_type == Enums::M_F64) { + size = 8; + } else if (m_type == Enums::M_U32 || + m_type == Enums::M_S32 || + m_type == Enums::M_F32) { + size = 4; + } else if (m_type == Enums::M_U16 || + m_type == Enums::M_S16 || + m_type == Enums::M_F16) { + size = 2; + } else { + size = 1; + } + + addr_operand = nullptr; + } + + void + init_addr(AddrOperandBase *_addr_operand) + { + addr_operand = _addr_operand; + } + + private: + int size; + AddrOperandBase *addr_operand; + + public: + int getMemOperandSize() { return size; } + AddrOperandBase *getAddressOperand() { return addr_operand; } + }; + + template + class LdaInstBase : public HsailGPUStaticInst + { + public: + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = 
obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.regIndex() : + this->addr.regIndex()); + } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + return 1; + } + }; + + template + class LdaInst : + public LdaInstBase, + public MemInst + { + public: + void generateDisassembly(); + + LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdaInstBase(ib, obj, _opcode) + { + init_addr(&this->addr); + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template + GPUStaticInst* + decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj); + + if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdaInst(ib, obj, "ldas"); + } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (regDataType.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdaInst(ib, obj, "ldas"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdaInst(ib, obj, "ldas"); + default: + fatal("Bad ldas register operand type %d\n", regDataType.type); + } + } else { + fatal("Bad ldas register operand kind %d\n", regDataType.kind); + } + } + + template + class LdInstBase : public HsailGPUStaticInst + { + public: + Brig::BrigWidth8_t width; + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigMemoryScope memoryScope; + unsigned int equivClass; + bool isArgLoad() + { + return segment == Brig::BRIG_SEGMENT_KERNARG || + segment == Brig::BRIG_SEGMENT_ARG; + } + void + initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch 
(segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = BRIG_WIDTH_1; + unsigned 
op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_LD) { + initLd(ib, obj, _opcode); + } else { + initAtomicLd(ib, obj, _opcode); + } + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + }; + + template + class LdInst : + public LdInstBase, + public MemInst + { + typename DestDataType::OperandType::DestOperand dest_vect[4]; + uint16_t num_dest_operands; + void generateDisassembly(); + + public: + LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdInstBase(ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + num_dest_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_dest_operands <= 4); + } else { + num_dest_operands = 1; + } + + if (num_dest_operands > 1) { + assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_dest_operands; ++i) { + dest_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_dest_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_dest_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_dest_operands; ++k) { + + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + // load from shared memory + *d = gpuDynInst->wavefront()->ldsChunk-> + read(vaddr); + } else { + 
Request *req = new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + // if this load has acquire semantics, + // set the response continuation function + // to perform an Acquire request + gpuDynInst->execContinuation = + &GPUStaticInst::execLdAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when + // the load completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + private: + void + execLdAcq(GPUDynInstPtr gpuDynInst) override + { + // after the load has complete and if the load has acquire + // semantics, issue an acquire request. 
+ if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + + public: + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + if (num_dest_operands > 1) { + return dest_vect[operandIndex].isVectorRegister(); + } + else if (num_dest_operands == 1) { + return LdInstBase::dest.isVectorRegister(); + } + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isCondRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isCondRegister(); + else if (num_dest_operands == 1) + return LdInstBase::dest.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isScalarRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isScalarRegister(); + else if (num_dest_operands == 1) + return LdInstBase::dest.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < 
getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return false; + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.opSize()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].opSize()); + else if (num_dest_operands == 1) + return(LdInstBase::dest.opSize()); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.regIndex()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].regIndex()); + else if (num_dest_operands == 1) + return(LdInstBase::dest.regIndex()); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return(num_dest_operands+1); + else + return(num_dest_operands); + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template + GPUStaticInst* + decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,1); + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdInst(ib, obj, "ld"); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdInst(ib, obj, "ld"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new 
LdInst(ib, obj, "ld"); + default: + fatal("Bad ld register operand type %d\n", tmp.regKind); + } + } else { + fatal("Bad ld register operand kind %d\n", tmp.kind); + } + } + + template + GPUStaticInst* + decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + BrigRegOperandInfo dest = findRegDataType(op_offs, obj); + + assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + switch(dest.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + case Brig::BRIG_TYPE_B16: + case Brig::BRIG_TYPE_B32: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_U8: + case Brig::BRIG_TYPE_U16: + case Brig::BRIG_TYPE_U32: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_S8: + case Brig::BRIG_TYPE_S16: + case Brig::BRIG_TYPE_S32: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_F16: + case Brig::BRIG_TYPE_F32: + return decodeLd2(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + case Brig::BRIG_REGISTER_KIND_DOUBLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B64: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_U64: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_S64: + return decodeLd2(ib, obj); + case Brig::BRIG_TYPE_F64: + return decodeLd2(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + default: + fatal("Bad ld register operand type %d, %d\n", dest.regKind, + ib->type); + } + } + + template + class StInstBase : public HsailGPUStaticInst + { + public: + typename SrcOperandType::SrcOperand src; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryScope memoryScope; + Brig::BrigMemoryOrder memoryOrder; + unsigned int equivClass; + + void + initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst 
= (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); + + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: 
+ panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_ST) { + initSt(ib, obj, _opcode); + } else { + initAtomicSt(ib, obj, _opcode); + } + } + + int numDstRegOperands() { return 0; } + int numSrcRegOperands() + { + return src.isVectorRegister() + this->addr.isVectorRegister(); + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isVectorRegister() : + this->addr.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isCondRegister() : + this->addr.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isScalarRegister() : + this->addr.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.opSize() : this->addr.opSize(); + } + int getRegisterIndex(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? 
src.regIndex() : this->addr.regIndex(); + } + }; + + + template + class StInst : + public StInstBase, + public MemInst + { + public: + typename SrcDataType::OperandType::SrcOperand src_vect[4]; + uint16_t num_src_operands; + void generateDisassembly(); + + StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode, int srcIdx) + : StInstBase(ib, obj, _opcode), + MemInst(SrcDataType::memType) + { + init_addr(&this->addr); + + BrigRegOperandInfo rinfo; + unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); + const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + const Brig::BrigOperandConstantBytes *op = + (Brig::BrigOperandConstantBytes*)baseOp; + + rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, + Brig::BRIG_TYPE_NONE); + } else { + rinfo = findRegDataType(op_offs, obj); + } + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)baseOp; + + num_src_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_src_operands <= 4); + } else { + num_src_operands = 1; + } + + if (num_src_operands > 1) { + assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_src_operands; ++i) { + src_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before performing a store, check if this store has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE) { + + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->execContinuation = &GPUStaticInst::execSt; + gpuDynInst->useContinuation = true; + // create request + Request *req = new Request(0, 0, 0, 0, + 
gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, perform stores immediately + execSt(gpuDynInst); + } + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execSt may be called through a continuation + // if the store had release semantics. see comment for + // execSt in gpu_static_inst.hh + void + execSt(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_src_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_src_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_src_operands; ++k) { + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + //store to shared memory + gpuDynInst->wavefront()->ldsChunk->write(vaddr, + *d); + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataStatic(d); + + // translation is performed in sendRequest() + // the request will be finished when the store completes + gpuDynInst->useContinuation = false; + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + public: + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isVectorRegister(); + if (num_src_operands > 1) + return 
src_vect[operandIndex].isVectorRegister(); + else if (num_src_operands == 1) + return StInstBase::src.isVectorRegister(); + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isCondRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isCondRegister(); + else if (num_src_operands == 1) + return StInstBase::src.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isScalarRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isScalarRegister(); + else if (num_src_operands == 1) + return StInstBase::src.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.opSize(); + if (num_src_operands > 1) + return src_vect[operandIndex].opSize(); + else if (num_src_operands == 1) + return StInstBase::src.opSize(); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.regIndex(); + if (num_src_operands > 1) + return src_vect[operandIndex].regIndex(); + else if (num_src_operands == 1) + return StInstBase::src.regIndex(); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return num_src_operands + 1; + else + return num_src_operands; + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template + GPUStaticInst* + 
decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + int srcIdx = 0; + int destIdx = 1; + if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC || + ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { + srcIdx = 1; + destIdx = 0; + } + unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new StInst(ib, obj, "st", srcIdx); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new StInst(ib, obj, "st", srcIdx); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new StInst(ib, obj, "st", srcIdx); + default: + fatal("Bad st register operand type %d\n", tmp.type); + } + } else { + fatal("Bad st register operand kind %d\n", tmp.kind); + } + } + + Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, + Brig::BrigAtomicOperation brigOp); + + template + class AtomicInstBase : public HsailGPUStaticInst + { + public: + typename OperandType::DestOperand dest; + typename OperandType::SrcOperand src[NumSrcOperands]; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigAtomicOperation atomicOperation; + Brig::BrigMemoryScope memoryScope; + Brig::BrigOpcode opcode; + Enums::MemOpType opType; + + AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + atomicOperation = (BrigAtomicOperation)at->atomicOperation; + opcode = (BrigOpcode)ib->opcode; + opType = brigAtomicToMemOpType(opcode, atomicOperation); + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_ATOMIC; + 
break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_ATOMIC; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_ATOMIC; + break; + + default: + panic("Atomic: segment %d not supported\n", segment); + } + + if (HasDst) { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 2); + src[i].init(op_offs, obj); + } + } else { + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + } + + int numSrcRegOperands() + { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + if (addr.isVectorRegister()) + operands++; + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (addr.isVectorRegister()) + return(NumSrcOperands + 2); + return(NumSrcOperands + 1); + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isCondRegister()); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return 
src[operandIndex].isScalarRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isScalarRegister()); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return false; + } + bool isDstOperand(int operandIndex) + { + if (operandIndex <= NumSrcOperands) + return false; + else + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].opSize()); + else if (operandIndex == NumSrcOperands) + return(addr.opSize()); + else + return(dest.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].regIndex()); + else if (operandIndex == NumSrcOperands) + return(addr.regIndex()); + else + return(dest.regIndex()); + return -1; + } + }; + + template + class AtomicInst : + public AtomicInstBase, + public MemInst + { + public: + void generateDisassembly(); + + AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : AtomicInstBase + (ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before doing the RMW, check if this atomic has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) { + + gpuDynInst->statusBitVector = VectorMask(1); + + gpuDynInst->execContinuation = &GPUStaticInst::execAtomic; + 
gpuDynInst->useContinuation = true; + + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, execute the RMW immediately + execAtomic(gpuDynInst); + + } + + void execute(GPUDynInstPtr gpuDynInst); + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execAtomic may be called through a continuation + // if the RMW had release semantics. see comment for + // execContinuation in gpu_dyn_inst.hh + void + execAtomic(GPUDynInstPtr gpuDynInst) override + { + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + typedef typename MemDataType::CType c0; + + c0 *d = &((c0*) gpuDynInst->d_data)[0]; + c0 *e = &((c0*) gpuDynInst->a_data)[0]; + c0 *f = &((c0*) gpuDynInst->x_data)[0]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i]; + + if (isLocalMem()) { + Wavefront *wavefront = gpuDynInst->wavefront(); + *d = wavefront->ldsChunk->read(vaddr); + + switch (this->opType) { + case Enums::MO_AADD: + case Enums::MO_ANRADD: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) + (*e)); + break; + case Enums::MO_ASUB: + case Enums::MO_ANRSUB: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) - (*e)); + break; + case Enums::MO_AMAX: + case Enums::MO_ANRMAX: + wavefront->ldsChunk->write(vaddr, + std::max(wavefront->ldsChunk->read(vaddr), + (*e))); + break; + case Enums::MO_AMIN: + case Enums::MO_ANRMIN: + wavefront->ldsChunk->write(vaddr, + std::min(wavefront->ldsChunk->read(vaddr), + (*e))); + break; + case Enums::MO_AAND: + case Enums::MO_ANRAND: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) & (*e)); + break; + case Enums::MO_AOR: + case Enums::MO_ANROR: + 
wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) | (*e)); + break; + case Enums::MO_AXOR: + case Enums::MO_ANRXOR: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) ^ (*e)); + break; + case Enums::MO_AINC: + case Enums::MO_ANRINC: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) + 1); + break; + case Enums::MO_ADEC: + case Enums::MO_ANRDEC: + wavefront->ldsChunk->write(vaddr, + wavefront->ldsChunk->read(vaddr) - 1); + break; + case Enums::MO_AEXCH: + case Enums::MO_ANREXCH: + wavefront->ldsChunk->write(vaddr, (*e)); + break; + case Enums::MO_ACAS: + case Enums::MO_ANRCAS: + wavefront->ldsChunk->write(vaddr, + (wavefront->ldsChunk->read(vaddr) == (*e)) ? + (*f) : wavefront->ldsChunk->read(vaddr)); + break; + default: + fatal("Unrecognized or invalid HSAIL atomic op " + "type.\n"); + break; + } + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i, + gpuDynInst->makeAtomicOpFunctor(e, + f, this->opType)); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::SwapReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE)) { + // if this atomic has acquire semantics, + // schedule the continuation to perform an + // acquire after the RMW completes + gpuDynInst->execContinuation = + &GPUStaticInst::execAtomicAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when the RMW completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, + pkt); + } + } + + ++d; + ++e; + ++f; + } + + gpuDynInst->updateStats(); + } + + // execAtomicACq will always be called through a continuation. 
+ // see comment for execContinuation in gpu_dyn_inst.hh + void + execAtomicAcq(GPUDynInstPtr gpuDynInst) override + { + // after performing the RMW, check to see if this instruction + // has acquire semantics, and if so, issue an acquire + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + + // the request will be finished when + // the acquire completes + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + }; + + template + GPUStaticInst* + constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { + return decodeLd(ib, obj); + } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + return decodeSt(ib, obj); + case Brig::BRIG_TYPE_B16: + return decodeSt(ib, obj); + case Brig::BRIG_TYPE_B32: + return decodeSt(ib, obj); + case Brig::BRIG_TYPE_B64: + return decodeSt(ib, obj); + default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); + } + } else { + if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) + return new AtomicInst(ib, obj, "atomicnoret"); + else + return new AtomicInst(ib, obj, "atomic"); + } + } + + template + GPUStaticInst* + decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned addrIndex = (Brig::BrigOpcode)ib->opcode == + Brig::BRIG_OPCODE_ATOMICNORET ? 
0 : 1; + + unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return constructAtomic(ib, obj); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return constructAtomic(ib, obj); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return constructAtomic(ib, obj); + default: + fatal("Bad atomic register operand type %d\n", tmp.type); + } + } else { + fatal("Bad atomic register operand kind %d\n", tmp.kind); + } + } + + + template + GPUStaticInst* + decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper(ib, obj); + } else { + return decodeAtomicHelper(ib, obj); + } + } + + template + GPUStaticInst* + decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper(ib, obj); + } else { + return decodeAtomicHelper(ib, obj); + } + } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_MEM_HH__ diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh new file mode 100644 index 000000000..94f0cd6aa --- /dev/null +++ b/src/arch/hsail/insts/mem_impl.hh @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/generic_types.hh" +#include "gpu-compute/hsail_code.hh" + +// defined in code.cc, but not worth sucking in all of code.h for this +// at this point +extern const char *segmentNames[]; + +namespace HsailISA +{ + template + void + LdaInst::generateDisassembly() + { + this->disassembly = csprintf("%s_%s %s,%s", this->opcode, + DestDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + } + + template + void + LdaInst::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename DestDataType::CType CType M5_VAR_USED; + const VectorMask &mask = w->get_pred(); + uint64_t addr_vec[VSZ]; + this->addr.calcVector(w, addr_vec); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, addr_vec[lane]); + } + } + } + + template + void + LdInst::generateDisassembly() + { + switch (num_dest_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->dest_vect[2].disassemble(), + this->dest_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: + fatal("Bad ld register dest operand, num vector operands: %d \n", + num_dest_operands); + break; + } + } + + static Addr + calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i) + { + // what is the size of the object we are accessing?? 
+ // NOTE: the compiler doesn't generate enough information + // to do this yet..have to just line up all the private + // work-item spaces back to back for now + /* + StorageElement* se = + i->parent->findSymbol(Brig::BrigPrivateSpace, addr); + assert(se); + + return w->wfSlotId * w->privSizePerItem * VSZ + + se->offset * VSZ + + lane * se->size; + */ + + // addressing strategy: interleave the private spaces of + // work-items in a wave-front on 8 byte granularity. + // this won't be perfect coalescing like the spill space + // strategy, but it's better than nothing. The spill space + // strategy won't work with private because the same address + // may be accessed by different sized loads/stores. + + // Note: I'm assuming that the largest load/store to private + // is 8 bytes. If it is larger, the stride will have to increase + + Addr addr_div8 = addr / 8; + Addr addr_mod8 = addr % 8; + + Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + + assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + + return ret; + } + + template + void + LdInst::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename MemDataType::CType MemCType; + const VectorMask &mask = w->get_pred(); + + // Kernarg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. + if (this->segment == Brig::BRIG_SEGMENT_KERNARG) { + MemCType val; + + // I assume no vector ld for kernargs + assert(num_dest_operands == 1); + + // assuming for the moment that we'll never do register + // offsets into kernarg space... 
just to make life simpler + uint64_t address = this->addr.calcUniform(); + + val = *(MemCType*)&w->kernelArgs[address]; + + DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, val); + } + } + + return; + } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + MemCType val = w->readCallArgMem(lane, address); + + DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address, + (unsigned long long)val); + + this->dest.set(w, lane, val); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + m->m_op = Enums::MO_LD; + m->m_type = MemDataType::memType; + m->v_type = DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = this->equivClass; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (num_dest_operands == 1) { + m->dst_reg = this->dest.regIndex(); + m->n_reg = 1; + } else { + m->n_reg = num_dest_operands; + for (int i = 0; i < num_dest_operands; ++i) { + m->dst_reg_vec[i] = this->dest_vect[i].regIndex(); + } + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). 
We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (m->addr[lane] < w->privSizePerItem) { + if (mask[lane]) { + // what is the size of the object we are accessing? + // find base for for this wavefront + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(MemCType) - 1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_dest_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + // note: this calculation will NOT WORK if the compiler + // ever generates loads/stores to the same address with + // different widths (e.g., a ld_u32 addr and a ld_u16 addr) + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(MemCType) + w->spillBase; + + w->last_addr[lane] = m->addr[lane]; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_READONLY: + m->s_type = SEG_READONLY; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); + m->addr[lane] += w->roBase; + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + 
w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + + m->addr[lane] = m->addr[lane] + + lane * sizeof(MemCType) + w->privBase; + } + } + } + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + default: + fatal("Load to unsupported segment %d %llxe\n", this->segment, + m->addr[0]); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template + void + StInst::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename OperationType::CType CType; + + const VectorMask &mask = w->get_pred(); + + // arg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. 
+ if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType data = this->src.template get(w, lane); + DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); + w->writeCallArgMem(lane, address, data); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + m->exec_mask = w->execMask(); + + this->addr.calcVector(w, m->addr); + + if (num_src_operands == 1) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[lane] = + this->src.template get(w, lane); + } + } + } else { + for (int k= 0; k < num_src_operands; ++k) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[k * VSZ + lane] = + this->src_vect[k].template get(w, lane); + } + } + } + } + + m->m_op = Enums::MO_ST; + m->m_type = OperationType::memType; + m->v_type = OperationType::vgprType; + + m->statusBitVector = 0; + m->equiv = this->equivClass; + + if (num_src_operands == 1) { + m->n_reg = 1; + } else { + m->n_reg = num_src_operands; + } + + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). 
We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + if (m->addr[lane] < w->privSizePerItem) { + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(CType)-1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_src_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(CType) + w->spillBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + m->addr[lane] = m->addr[lane] + lane * + sizeof(CType)+w->privBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + default: + fatal("Store to unsupported segment %d\n", this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template + void + StInst::generateDisassembly() + { + switch (num_src_operands) { + case 1: + this->disassembly = 
csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->src_vect[2].disassemble(), + this->src_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: fatal("Bad ld register src operand, num vector operands: " + "%d \n", num_src_operands); + break; + } + } + + template + void + AtomicInst::execute(GPUDynInstPtr gpuDynInst) + { + typedef typename DataType::CType CType; + + Wavefront *w = gpuDynInst->wavefront(); + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + for (int lane = 0; lane < VSZ; ++lane) { + ((CType *)m->a_data)[lane] = + this->src[0].template get(w, lane); + } + + // load second source operand for CAS + if (NumSrcOperands > 1) { + for (int lane = 0; lane < VSZ; ++lane) { + ((CType*)m->x_data)[lane] = + this->src[1].template get(w, lane); + } + } + + assert(NumSrcOperands <= 2); + + m->m_op = this->opType; + m->m_type = DataType::memType; + m->v_type = DataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (HasDst) { + m->dst_reg = this->dest.regIndex(); + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + 
m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->latency.set(w->computeUnit->shader->ticks(64)); + m->pipeId = GLBMEM_PIPE; + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + default: + fatal("Atomic op to unsupported segment %d\n", + this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp); + + template + void + AtomicInst::generateDisassembly() + { + if (HasDst) { + this->disassembly = + csprintf("%s_%s_%s_%s %s,%s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->dest.disassemble(), + this->addr.disassemble()); + } else { + this->disassembly = + csprintf("%s_%s_%s_%s %s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->addr.disassemble()); + } + + for (int i = 0; i < NumSrcOperands; ++i) { + this->disassembly += ","; + this->disassembly += this->src[i].disassemble(); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc new file mode 100644 index 000000000..9506a80ab --- /dev/null +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Marc Orr + */ + +#include + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/mem.hh" + +namespace HsailISA +{ + // Pseudo (or magic) instructions are overloaded on the hsail call + // instruction, because of its flexible parameter signature. + + // To add a new magic instruction: + // 1. Add an entry to the enum. + // 2. Implement it in the switch statement below (Call::exec). 
+ // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, + // so its easy to call from an OpenCL kernel. + + // This enum should be identical to the enum in + // hsa/hsail-gpu-compute/util/magicinst.h + enum + { + MAGIC_PRINT_WF_32 = 0, + MAGIC_PRINT_WF_64, + MAGIC_PRINT_LANE, + MAGIC_PRINT_LANE_64, + MAGIC_PRINT_WF_FLOAT, + MAGIC_SIM_BREAK, + MAGIC_PREF_SUM, + MAGIC_REDUCTION, + MAGIC_MASKLANE_LOWER, + MAGIC_MASKLANE_UPPER, + MAGIC_JOIN_WF_BAR, + MAGIC_WAIT_WF_BAR, + MAGIC_PANIC, + MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, + MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, + MAGIC_LOAD_GLOBAL_U32_REG, + MAGIC_XACT_CAS_LD, + MAGIC_MOST_SIG_THD, + MAGIC_MOST_SIG_BROADCAST, + MAGIC_PRINT_WFID_32, + MAGIC_PRINT_WFID_64 + }; + + void + Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + const VectorMask &mask = w->get_pred(); + + int op = 0; + bool got_op = false; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val0 = src1.get(w, lane, 0); + if (got_op) { + if (src_val0 != op) { + fatal("Multiple magic instructions per PC not " + "supported\n"); + } + } else { + op = src_val0; + got_op = true; + } + } + } + + switch(op) { + case MAGIC_PRINT_WF_32: + MagicPrintWF32(w); + break; + case MAGIC_PRINT_WF_64: + MagicPrintWF64(w); + break; + case MAGIC_PRINT_LANE: + MagicPrintLane(w); + break; + case MAGIC_PRINT_LANE_64: + MagicPrintLane64(w); + break; + case MAGIC_PRINT_WF_FLOAT: + MagicPrintWFFloat(w); + break; + case MAGIC_SIM_BREAK: + MagicSimBreak(w); + break; + case MAGIC_PREF_SUM: + MagicPrefixSum(w); + break; + case MAGIC_REDUCTION: + MagicReduction(w); + break; + case MAGIC_MASKLANE_LOWER: + MagicMaskLower(w); + break; + case MAGIC_MASKLANE_UPPER: + MagicMaskUpper(w); + break; + case MAGIC_JOIN_WF_BAR: + MagicJoinWFBar(w); + break; + case MAGIC_WAIT_WF_BAR: + MagicWaitWFBar(w); + break; + case MAGIC_PANIC: + MagicPanic(w); + break; + + // atomic instructions + case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: + 
MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: + MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); + break; + + case MAGIC_LOAD_GLOBAL_U32_REG: + MagicLoadGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_XACT_CAS_LD: + MagicXactCasLd(w); + break; + + case MAGIC_MOST_SIG_THD: + MagicMostSigThread(w); + break; + + case MAGIC_MOST_SIG_BROADCAST: + MagicMostSigBroadcast(w); + break; + + case MAGIC_PRINT_WFID_32: + MagicPrintWF32ID(w); + break; + + case MAGIC_PRINT_WFID_64: + MagicPrintWFID64(w); + break; + + default: fatal("unrecognized magic instruction: %d\n", op); + } + } + + void + Call::MagicPrintLane(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintLane64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int64_t src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintWF32(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", 
disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWF32ID(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + src_val3 = src1.get(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWF64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + 
} + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWFID64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + src_val3 = src1.get(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWFFloat(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + float src_val1 = src1.get(w, lane, 1); + res_str += csprintf("%08f", src_val1); + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + // raises a signal that GDB will catch + // when done with the break, type "signal 0" in gdb to continue + void + Call::MagicSimBreak(Wavefront *w) + { + std::string res_str; + // print out state for this wavefront and then break + res_str = csprintf("Breakpoint encountered for wavefront %i\n", + w->wfSlotId); + + res_str += csprintf(" 
Kern ID: %i\n", w->kern_id); + res_str += csprintf(" Phase ID: %i\n", w->simdId); + res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); + res_str += csprintf(" Exec mask: "); + + for (int i = VSZ - 1; i >= 0; --i) { + if (w->execMask(i)) + res_str += "1"; + else + res_str += "0"; + + if ((i & 7) == 7) + res_str += " "; + } + + res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); + + res_str += "\nHelpful debugging hints:\n"; + res_str += " Check out w->s_reg / w->d_reg for register state\n"; + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + fflush(stdout); + + raise(SIGTRAP); + } + + void + Call::MagicPrefixSum(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + dest.set(w, lane, res); + res += src_val1; + } + } + } + + void + Call::MagicReduction(Wavefront *w) + { + // reduction magic instruction + // The reduction instruction takes up to 64 inputs (one from + // each thread in a WF) and sums them. It returns the sum to + // each thread in the WF. 
+ const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + res += src_val1; + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set(w, lane, res); + } + } + } + + void + Call::MagicMaskLower(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + + if (src_val1) { + if (lane < (VSZ/2)) { + res = res | ((uint32_t)(1) << lane); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set(w, lane, res); + } + } + } + + void + Call::MagicMaskUpper(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + + if (src_val1) { + if (lane >= (VSZ/2)) { + res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set(w, lane, res); + } + } + } + + void + Call::MagicJoinWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]++; + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + } + + if (max_cnt > w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + } + + void + Call::MagicWaitWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]--; + } + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + + if (max_cnt < w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) + w->dropFetch = true; + } + + void + 
Call::MagicPanic(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get(w, lane, 1); + panic("OpenCL Code failed assertion #%d. Triggered by lane %s", + src_val1, lane); + } + } + } + + void + Call::calcAddr(Wavefront *w, GPUDynInstPtr m) + { + // the address is in src1 | src2 + for (int lane = 0; lane < VSZ; ++lane) { + int src_val1 = src1.get(w, lane, 1); + int src_val2 = src1.get(w, lane, 2); + Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); + + m->addr[lane] = addr; + } + + } + + void + Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get(w, lane, 3); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get(w, lane, 1); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + 
Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + // calculate the address + calcAddr(w, m); + + m->m_op = Enums::MO_LD; + m->m_type = U32::memType; //MemDataType::memType; + m->v_type = U32::vgprType; //DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + // FIXME + //m->dst_reg = this->dest.regIndex(); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicXactCasLd(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int src_val1 = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + src_val1 = src1.get(w, lane, 1); + break; + } + } + + if 
(!w->computeUnit->xactCasLoadMap.count(src_val1)) { + w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); + } + + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue + .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); + } + + void + Call::MagicMostSigThread(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + unsigned mst = true; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + dest.set(w, lane, mst); + mst = false; + } + } + } + + void + Call::MagicMostSigBroadcast(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + bool got_res = false; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + if (!got_res) { + res = src1.get(w, lane, 1); + got_res = true; + } + dest.set(w, lane, res); + } + } + } + +} // namespace HsailISA diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc new file mode 100644 index 000000000..d0e6c5541 --- /dev/null +++ b/src/arch/hsail/operand.cc @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/operand.hh" + +using namespace Brig; + +bool +BaseRegOperand::init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER) + return false; + + const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +ListOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset); + + switch (brigOp->kind) { + case BRIG_KIND_OPERAND_CODE_LIST: + { + const BrigOperandCodeList *opList = + (const BrigOperandCodeList*)brigOp; + + const Brig::BrigData *oprnd_data = + obj->getBrigBaseData(opList->elements); + + // Note: for calls Dest list of operands could be size of 0. 
+ elementCount = oprnd_data->byteCount / 4; + + DPRINTF(GPUReg, "Operand Code List: # elements: %d\n", + elementCount); + + for (int i = 0; i < elementCount; ++i) { + unsigned *data_offset = + (unsigned*)obj->getData(opList->elements + 4 * (i + 1)); + + const BrigDirectiveVariable *p = + (const BrigDirectiveVariable*)obj-> + getCodeSectionEntry(*data_offset); + + StorageElement *se = obj->currentCode->storageMap-> + findSymbol(BRIG_SEGMENT_ARG, p); + + assert(se); + callArgs.push_back(se); + } + } + break; + default: + fatal("ListOperand: bad operand kind %d\n", brigOp->kind); + } +} + +std::string +ListOperand::disassemble() +{ + std::string res_str(""); + + for (auto it : callArgs) { + res_str += csprintf("%s ", it->name.c_str()); + } + + return res_str; +} + +void +FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) { + fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind); + } + + const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp; + + const BrigDirectiveExecutable *p = + (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref); + + func_name = obj->getString(p->name); +} + +std::string +FunctionRefOperand::disassemble() +{ + DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name); + + return csprintf("%s", func_name); +} + +bool +BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj, + int at, unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST) + return false; + + + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + unsigned *data_offset = + (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1)); + + const BrigOperand *p = + (const BrigOperand*)obj->getOperand(*data_offset); + if 
(p->kind != BRIG_KIND_OPERAND_REGISTER) { + return false; + } + + const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + const char *name = obj->getString(strOffset); + char *endptr; + regIdx = strtoul(name + 2, &endptr, 10); + + if (name[0] != '$' || name[1] != _regFileChar) { + fatal("register operand parse error on \"%s\"\n", name); + } + + maxRegIdx = std::max(maxRegIdx, regIdx); +} + +unsigned SRegOperand::maxRegIdx; +unsigned DRegOperand::maxRegIdx; +unsigned CRegOperand::maxRegIdx; + +std::string +SRegOperand::disassemble() +{ + return csprintf("$s%d", regIdx); +} + +std::string +DRegOperand::disassemble() +{ + return csprintf("$d%d", regIdx); +} + +std::string +CRegOperand::disassemble() +{ + return csprintf("$c%d", regIdx); +} + +BrigRegOperandInfo +findRegDataType(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_REGISTER: + { + const BrigOperandRegister *op = (BrigOperandRegister*)baseOp; + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, + (BrigRegisterKind)op->regKind); + } + break; + + case BRIG_KIND_OPERAND_OPERAND_LIST: + { + const BrigOperandOperandList *op = + (BrigOperandOperandList*)baseOp; + const BrigData *data_p = (BrigData*)obj->getData(op->elements); + + + int num_operands = 0; + BrigRegisterKind reg_kind = (BrigRegisterKind)0; + for (int offset = 0; offset < data_p->byteCount; offset += 4) { + const BrigOperand *op_p = (const BrigOperand *) + obj->getOperand(((int *)data_p->bytes)[offset/4]); + + if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) { + const BrigOperandRegister *brigRegOp = + (const 
BrigOperandRegister*)op_p; + reg_kind = (BrigRegisterKind)brigRegOp->regKind; + } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) { + uint16_t num_bytes = + ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount + - sizeof(BrigBase); + if (num_bytes == sizeof(uint32_t)) { + reg_kind = BRIG_REGISTER_KIND_SINGLE; + } else if (num_bytes == sizeof(uint64_t)) { + reg_kind = BRIG_REGISTER_KIND_DOUBLE; + } else { + fatal("OperandList: bad operand size %d\n", num_bytes); + } + } else { + fatal("OperandList: bad operand kind %d\n", op_p->kind); + } + + num_operands++; + } + assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST); + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind); + } + break; + + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + + if (!op->reg) { + BrigType type = BRIG_TYPE_NONE; + + if (op->symbol) { + const BrigDirective *dir = (BrigDirective*) + obj->getCodeSectionEntry(op->symbol); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)dir; + + type = (BrigType)sym->type; + } + return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS, + (BrigType)type); + } else { + const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp; + const BrigOperand *reg = obj->getOperand(b->reg); + const BrigOperandRegister *rop = (BrigOperandRegister*)reg; + + return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER, + (BrigRegisterKind)rop->regKind); + } + } + break; + + default: + fatal("AddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +void +AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj) +{ + assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS); + + const BrigDirective *d = + (BrigDirective*)obj->getCodeSectionEntry(op->symbol); + + assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d; + name = 
obj->getString(sym->name); + + if (sym->segment != BRIG_SEGMENT_ARG) { + storageElement = + obj->currentCode->storageMap->findSymbol(sym->segment, name); + assert(storageElement); + offset = 0; + } else { + // sym->name does not work for BRIG_SEGMENT_ARG for the following case: + // + // void foo(int a); + // void bar(double a); + // + // foo(...) --> arg_u32 %param_p0; + // st_arg_u32 $s0, [%param_p0]; + // call &foo (%param_p0); + // bar(...) --> arg_f64 %param_p0; + // st_arg_u64 $d0, [%param_p0]; + // call &bar (%param_p0); + // + // Both functions use the same variable name (param_p0)!!! + // + // Maybe this is a bug in the compiler (I don't know). + // + // Solution: + // Use directive pointer (BrigDirectiveVariable) to differentiate 2 + // versions of param_p0. + // + // Note this solution is kind of stupid, because we are pulling stuff + // out of the brig binary via the directive pointer and putting it into + // the symbol table, but now we are indexing the symbol table by the + // brig directive pointer! It makes the symbol table sort of pointless. + // But I don't want to mess with the rest of the infrastructure, so + // let's go with this for now. + // + // When we update the compiler again, we should see if this problem goes + // away. If so, we can fold some of this functionality into the code for + // kernel arguments. 
If not, maybe we can index the symbol name on a + // hash of the variable AND function name + storageElement = obj->currentCode-> + storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym); + + assert(storageElement); + } +} + +uint64_t +AddrOperandBase::calcUniformBase() +{ + // start with offset, will be 0 if not specified + uint64_t address = offset; + + // add in symbol value if specified + if (storageElement) { + address += storageElement->offset; + } + + return address; +} + +std::string +AddrOperandBase::disassemble(std::string reg_disassembly) +{ + std::string disasm; + + if (offset || reg_disassembly != "") { + disasm += "["; + + if (reg_disassembly != "") { + disasm += reg_disassembly; + + if (offset > 0) { + disasm += "+"; + } + } + + if (offset) { + disasm += csprintf("%d", offset); + } + + disasm += "]"; + } else if (name) { + disasm += csprintf("[%s]", name); + } + + return disasm; +} + +void +NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) { + BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp; + parseAddr(addrOp, obj); + offset = (uint64_t(addrOp->offset.hi) << 32) | + uint64_t(addrOp->offset.lo); + } else { + fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind); + } + +} + +std::string +NoRegAddrOperand::disassemble() +{ + return AddrOperandBase::disassemble(std::string("")); +} + +void +LabelOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperandCodeRef *op = + (const BrigOperandCodeRef*)obj->getOperand(opOffset); + + assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF); + + const BrigDirective *dir = + (const BrigDirective*)obj->getCodeSectionEntry(op->ref); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL); + label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj); +} + +uint32_t +LabelOperand::getTarget(Wavefront *w, int lane) +{ + return label->get(); +} + 
+std::string +LabelOperand::disassemble() +{ + return label->name; +} diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh new file mode 100644 index 000000000..e3d275b10 --- /dev/null +++ b/src/arch/hsail/operand.hh @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_OPERAND_HH__ +#define __ARCH_HSAIL_OPERAND_HH__ + +/** + * @file operand.hh + * + * Defines classes encapsulating HSAIL instruction operands. + */ + +#include + +#include "arch/hsail/Brig.h" +#include "base/trace.hh" +#include "base/types.hh" +#include "debug/GPUReg.hh" +#include "enums/RegisterType.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +class Label; +class StorageElement; + +class BaseOperand +{ + public: + Enums::RegisterType registerType; + uint32_t regOperandSize; + BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; } + bool isVectorRegister() { return registerType == Enums::RT_VECTOR; } + bool isScalarRegister() { return registerType == Enums::RT_SCALAR; } + bool isCondRegister() { return registerType == Enums::RT_CONDITION; } + unsigned int regIndex() { return 0; } + uint32_t opSize() { return regOperandSize; } + virtual ~BaseOperand() { } +}; + +class BrigRegOperandInfo +{ + public: + Brig::BrigKind16_t kind; + Brig::BrigType type; + Brig::BrigRegisterKind regKind; + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, + Brig::BrigRegisterKind _regKind) + : kind(_kind), regKind(_regKind) + { + } + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type) + : kind(_kind), type(_type) + { + } + + BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES), + type(Brig::BRIG_TYPE_NONE) + { + } +}; + +BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj); + +class BaseRegOperand : public BaseOperand +{ + public: + unsigned regIdx; + char regFileChar; + + bool init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at, + unsigned &maxRegIdx, char 
_regFileChar); + + void initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + unsigned int regIndex() { return regIdx; } +}; + +class SRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 's'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 's'); + } + + template + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + // if OperandType is smaller than 32-bit, we truncate the value + OperandType ret; + uint32_t vgprIdx; + + switch (sizeof(OperandType)) { + case 1: // 1 byte operand + vgprIdx = w->remap(regIdx, 1, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read(vgprIdx, lane)) & 0xff; + break; + case 2: // 2 byte operand + vgprIdx = w->remap(regIdx, 2, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read(vgprIdx, lane)) & 0xffff; + break; + case 4: // 4 byte operand + vgprIdx = w->remap(regIdx,sizeof(OperandType), 1); + ret = w->computeUnit->vrf[w->simdId]-> + read(vgprIdx, lane); + break; + default: + panic("Bad OperandType\n"); + break; + } + + return (OperandType)ret; + } + + // special get method for compatibility with LabelOperand + uint32_t + getTarget(Wavefront *w, int lane) + { + return get(w, lane); + } + + template + void set(Wavefront *w, int lane, OperandType &val); + std::string disassemble(); +}; + 
+template +void +SRegOperand::set(Wavefront *w, int lane, OperandType &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(sizeof(OperandType) == sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write(vgprIdx,val,lane); +} + +template<> +inline void +SRegOperand::set(Wavefront *w, int lane, uint64_t &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1); + w->computeUnit->vrf[w->simdId]->write(vgprIdx, val, lane); +} + +class DRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'd'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'd'); + } + + template + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + + return w->computeUnit->vrf[w->simdId]->read(vgprIdx,lane); + } + + template + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d 
<- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write(vgprIdx,val,lane); + } + + std::string disassemble(); +}; + +class CRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'c'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'c'); + } + + template + OperandType + get(Wavefront *w, int lane) + { + assert(regIdx < w->condRegState->numRegs()); + + return w->condRegState->read((int)regIdx, lane); + } + + template + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(regIdx < w->condRegState->numRegs()); + w->condRegState->write(regIdx,lane,val); + } + + std::string disassemble(); +}; + +template +class ImmOperand : public BaseOperand +{ + public: + T bits; + + bool init(unsigned opOffset, const BrigObject *obj); + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template + OperandType + get() + { + assert(sizeof(OperandType) <= sizeof(T)); + + return *(OperandType*)&bits; + } + + // 
This version of get() takes a WF* and a lane id for + // compatibility with the register-based get() methods. + template + OperandType + get(Wavefront *w, int lane) + { + return get(); + } +}; + +template +bool +ImmOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + switch (brigOp->kind) { + // this is immediate operand + case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES: + { + DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T), + brigOp->byteCount); + + auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp; + + bits = *((T*)(obj->getData(cbptr->bytes + 4))); + + return true; + } + break; + + case Brig::BRIG_KIND_OPERAND_WAVESIZE: + bits = VSZ; + return true; + + default: + return false; + } +} + +template +bool +ImmOperand::init_from_vect(unsigned opOffset, const BrigObject *obj, int at) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + return false; + } + + + const Brig::BrigOperandOperandList *brigVecOp = + (const Brig::BrigOperandOperandList *)brigOp; + + unsigned *data_offset = + (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1)); + + const Brig::BrigOperand *p = + (const Brig::BrigOperand *)obj->getOperand(*data_offset); + + if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + return false; + } + + return init(*data_offset, obj); +} +template +std::string +ImmOperand::disassemble() +{ + return csprintf("0x%08x", bits); +} + +template +class RegOrImmOperand : public BaseOperand +{ + private: + bool is_imm; + + public: + void setImm(const bool value) { is_imm = value; } + + ImmOperand imm_op; + RegOperand reg_op; + + RegOrImmOperand() { is_imm = false; } + void init(unsigned opOffset, const BrigObject *obj); + void init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template + OperandType + get(Wavefront *w, int lane) + { + return is_imm ? 
imm_op.template get() : + reg_op.template get(w, lane); + } + + uint32_t + opSize() + { + if (!is_imm) { + return reg_op.opSize(); + } + + return 0; + } + + bool + isVectorRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_VECTOR; + } + return false; + } + + bool + isCondRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_CONDITION; + } + + return false; + } + + bool + isScalarRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_SCALAR; + } + + return false; + } + + unsigned int + regIndex() + { + if (!is_imm) { + return reg_op.regIndex(); + } + return 0; + } +}; + +template +void +RegOrImmOperand::init(unsigned opOffset, const BrigObject *obj) +{ + is_imm = false; + + if (reg_op.init(opOffset, obj)) { + return; + } + + if (imm_op.init(opOffset, obj)) { + is_imm = true; + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template +void +RegOrImmOperand::init_from_vect(unsigned opOffset, + const BrigObject *obj, int at) +{ + if (reg_op.init_from_vect(opOffset, obj, at)) { + is_imm = false; + + return; + } + + if (imm_op.init_from_vect(opOffset, obj, at)) { + is_imm = true; + + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template +std::string +RegOrImmOperand::disassemble() +{ + return is_imm ? 
imm_op.disassemble() : reg_op.disassemble(); +} + +typedef RegOrImmOperand SRegOrImmOperand; +typedef RegOrImmOperand DRegOrImmOperand; +typedef RegOrImmOperand CRegOrImmOperand; + +class AddrOperandBase : public BaseOperand +{ + protected: + // helper function for init() + void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj); + + // helper function for disassemble() + std::string disassemble(std::string reg_disassembly); + uint64_t calcUniformBase(); + + public: + virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0; + virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0; + + uint64_t offset; + const char *name = nullptr; + StorageElement *storageElement; +}; + +template +class RegAddrOperand : public AddrOperandBase +{ + public: + RegOperandType reg; + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + uint32_t opSize() { return reg.opSize(); } + bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; } + bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; } + bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; } + unsigned int regIndex() { return reg.regIndex(); } + std::string disassemble(); +}; + +template +void +RegAddrOperand::init(unsigned opOffset, const BrigObject *obj) +{ + using namespace Brig; + + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + storageElement = nullptr; + + offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo); + reg.init(op->reg, obj); + + if (reg.regFileChar == 's') { + reg.regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + } + else if (reg.regFileChar == 'd') { + reg.regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + } + } + break; + 
+ default: + fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +template +uint64_t +RegAddrOperand::calcUniform() +{ + fatal("can't do calcUniform() on register-based address\n"); + + return 0; +} + +template +void +RegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +{ + Addr address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (w->execMask(lane)) { + if (reg.regFileChar == 's') { + addrVec[lane] = address + reg.template get(w, lane); + } else { + addrVec[lane] = address + reg.template get(w, lane); + } + } + } +} + +template +uint64_t +RegAddrOperand::calcLane(Wavefront *w, int lane) +{ + Addr address = calcUniformBase(); + + return address + reg.template get(w, lane); +} + +template +std::string +RegAddrOperand::disassemble() +{ + return AddrOperandBase::disassemble(reg.disassemble()); +} + +typedef RegAddrOperand SRegAddrOperand; +typedef RegAddrOperand DRegAddrOperand; + +class NoRegAddrOperand : public AddrOperandBase +{ + public: + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + std::string disassemble(); +}; + +inline uint64_t +NoRegAddrOperand::calcUniform() +{ + return AddrOperandBase::calcUniformBase(); +} + +inline uint64_t +NoRegAddrOperand::calcLane(Wavefront *w, int lane) +{ + return calcUniform(); +} + +inline void +NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +{ + uint64_t address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) + addrVec[lane] = address; +} + +class LabelOperand : public BaseOperand +{ + public: + Label *label; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); + + // special get method for compatibility with SRegOperand + uint32_t getTarget(Wavefront *w, int lane); + +}; + +class ListOperand : public BaseOperand +{ + public: + int elementCount; + std::vector callArgs; + + 
int + getSrcOperand(int idx) + { + DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx, + callArgs.size()); + + return callArgs.at(idx)->offset; + } + + void init(unsigned opOffset, const BrigObject *obj); + + std::string disassemble(); + + template + OperandType + get(Wavefront *w, int lane, int arg_idx) + { + return w->readCallArgMem(lane, getSrcOperand(arg_idx)); + } + + template + void + set(Wavefront *w, int lane, OperandType val) + { + w->writeCallArgMem(lane, getSrcOperand(0), val); + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, + getSrcOperand(0), val); + } +}; + +class FunctionRefOperand : public BaseOperand +{ + public: + const char *func_name; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); +}; + +#endif // __ARCH_HSAIL_OPERAND_HH__ diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py new file mode 100644 index 000000000..bd95f6335 --- /dev/null +++ b/src/gpu-compute/GPU.py @@ -0,0 +1,310 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +from ClockedObject import ClockedObject +from Device import DmaDevice +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject +from MemObject import MemObject +from Process import EmulatedDriver +from Bridge import Bridge +from LdsState import LdsState + +class PrefetchType(Enum): vals = [ + 'PF_CU', + 'PF_PHASE', + 'PF_WF', + 'PF_STRIDE', + 'PF_END', + ] + +class VectorRegisterFile(SimObject): + type = 'VectorRegisterFile' + cxx_class = 'VectorRegisterFile' + cxx_header = 'gpu-compute/vector_register_file.hh' + + simd_id = Param.Int(0, 'SIMD ID associated with this VRF') + num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + +class Wavefront(SimObject): + type = 'Wavefront' + cxx_class = 'Wavefront' + cxx_header = 'gpu-compute/wavefront.hh' + + simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') + wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + +class ComputeUnit(MemObject): + type = 'ComputeUnit' + cxx_class = 'ComputeUnit' + cxx_header = 'gpu-compute/compute_unit.hh' + + 
wavefronts = VectorParam.Wavefront('Number of wavefronts') + wfSize = Param.Int(64, 'Wavefront size (in work items)') + num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + + spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ + 'latency') + + dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + 'latency') + + issue_period = Param.Int(4, 'number of cycles per issue period') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') + num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ + "Represents the pipeline to reach the TCP and "\ + "specified in GPU clock cycles") + mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the TCP "\ + "and cu as well as TCP data array access. "\ + "Specified in GPU clock cycles") + system = Param.System(Parent.any, "system object") + cu_id = Param.Int('CU id') + vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ + "in bytes") + coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ + "in bytes") + + memory_port = VectorMasterPort("Port to the memory system") + translation_port = VectorMasterPort('Port to the TLB hierarchy') + sqc_port = MasterPort("Port to the SQC (I-cache") + sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + perLaneTLB = Param.Bool(False, "enable per-lane TLB") + prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ + "(0 turns off prefetching)") + prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") + prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ + "from last mem req in lane of "\ + "CU|Phase|Wavefront") + execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); + 
xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); + debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") + functionalTLB = Param.Bool(False, "Assume TLB causes no delay") + + localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ + "kernel end") + + countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ + "and how many times") + global_mem_queue_size = Param.Int(256, "Number of entries in the global " + "memory pipeline's queues") + local_mem_queue_size = Param.Int(256, "Number of entries in the local " + "memory pipeline's queues") + ldsBus = Bridge() # the bridge between the CU and its LDS + ldsPort = MasterPort("The port that goes to the LDS") + localDataStore = Param.LdsState("the LDS for this CU") + + vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ + "file") + +class Shader(ClockedObject): + type = 'Shader' + cxx_class = 'Shader' + cxx_header = 'gpu-compute/shader.hh' + + CUs = VectorParam.ComputeUnit('Number of compute units') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into + ruby at kernel boundaries""") + separate_acquire_release = Param.Bool(False, + """Do ld_acquire/st_release generate separate requests for the + acquire and release?""") + globalmem = Param.MemorySize('64kB', 'Memory size') + timing = Param.Bool(False, 'timing memory accesses') + + cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") + translation = Param.Bool(False, "address translation"); + +class ClDriver(EmulatedDriver): + type = 'ClDriver' + cxx_header = 'gpu-compute/cl_driver.hh' + codefile = VectorParam.String('code file name(s)') + +class GpuDispatcher(DmaDevice): + type = 'GpuDispatcher' + cxx_header = 'gpu-compute/dispatcher.hh' + # put at 8GB line for now + pio_addr = Param.Addr(0x200000000, "Device Address") + pio_latency = Param.Latency('1ns', "Programmed IO latency") + 
shader_pointer = Param.Shader('pointer to shader') + translation_port = MasterPort('Port to the dispatcher TLB') + cpu = Param.BaseCPU("CPU to wake up on kernel completion") + + cl_driver = Param.ClDriver('pointer to driver') + +class OpType(Enum): vals = [ + 'OT_NULL', + 'OT_ALU', + 'OT_SPECIAL', + 'OT_GLOBAL_READ', + 'OT_GLOBAL_WRITE', + 'OT_GLOBAL_ATOMIC', + 'OT_GLOBAL_HIST', + 'OT_GLOBAL_LDAS', + 'OT_SHARED_READ', + 'OT_SHARED_WRITE', + 'OT_SHARED_ATOMIC', + 'OT_SHARED_HIST', + 'OT_SHARED_LDAS', + 'OT_PRIVATE_READ', + 'OT_PRIVATE_WRITE', + 'OT_PRIVATE_ATOMIC', + 'OT_PRIVATE_HIST', + 'OT_PRIVATE_LDAS', + 'OT_SPILL_READ', + 'OT_SPILL_WRITE', + 'OT_SPILL_ATOMIC', + 'OT_SPILL_HIST', + 'OT_SPILL_LDAS', + 'OT_READONLY_READ', + 'OT_READONLY_WRITE', + 'OT_READONLY_ATOMIC', + 'OT_READONLY_HIST', + 'OT_READONLY_LDAS', + 'OT_FLAT_READ', + 'OT_FLAT_WRITE', + 'OT_FLAT_ATOMIC', + 'OT_FLAT_HIST', + 'OT_FLAT_LDAS', + 'OT_KERN_READ', + 'OT_BRANCH', + + # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version + # of the compiler. 
+ 'OT_SHARED_MEMFENCE', + 'OT_GLOBAL_MEMFENCE', + 'OT_BOTH_MEMFENCE', + + 'OT_BARRIER', + 'OT_PRINT', + 'OT_RET', + 'OT_NOP', + 'OT_ARG' + ] + +class MemType(Enum): vals = [ + 'M_U8', + 'M_U16', + 'M_U32', + 'M_U64', + 'M_S8', + 'M_S16', + 'M_S32', + 'M_S64', + 'M_F16', + 'M_F32', + 'M_F64', + ] + +class MemOpType(Enum): vals = [ + 'MO_LD', + 'MO_ST', + 'MO_LDAS', + 'MO_LDA', + 'MO_AAND', + 'MO_AOR', + 'MO_AXOR', + 'MO_ACAS', + 'MO_AEXCH', + 'MO_AADD', + 'MO_ASUB', + 'MO_AINC', + 'MO_ADEC', + 'MO_AMAX', + 'MO_AMIN', + 'MO_ANRAND', + 'MO_ANROR', + 'MO_ANRXOR', + 'MO_ANRCAS', + 'MO_ANREXCH', + 'MO_ANRADD', + 'MO_ANRSUB', + 'MO_ANRINC', + 'MO_ANRDEC', + 'MO_ANRMAX', + 'MO_ANRMIN', + 'MO_HAND', + 'MO_HOR', + 'MO_HXOR', + 'MO_HCAS', + 'MO_HEXCH', + 'MO_HADD', + 'MO_HSUB', + 'MO_HINC', + 'MO_HDEC', + 'MO_HMAX', + 'MO_HMIN', + 'MO_UNDEF' + ] + +class StorageClassType(Enum): vals = [ + 'SC_SPILL', + 'SC_GLOBAL', + 'SC_SHARED', + 'SC_PRIVATE', + 'SC_READONLY', + 'SC_KERNARG', + 'SC_NONE', + ] + +class RegisterType(Enum): vals = [ + 'RT_VECTOR', + 'RT_SCALAR', + 'RT_CONDITION', + 'RT_HARDWARE', + 'RT_NONE', + ] + +class GenericMemoryOrder(Enum): vals = [ + 'MEMORY_ORDER_NONE', + 'MEMORY_ORDER_RELAXED', + 'MEMORY_ORDER_SC_ACQUIRE', + 'MEMORY_ORDER_SC_RELEASE', + 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', + ] + +class GenericMemoryScope(Enum): vals = [ + 'MEMORY_SCOPE_NONE', + 'MEMORY_SCOPE_WORKITEM', + 'MEMORY_SCOPE_WAVEFRONT', + 'MEMORY_SCOPE_WORKGROUP', + 'MEMORY_SCOPE_DEVICE', + 'MEMORY_SCOPE_SYSTEM', + ] diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py new file mode 100644 index 000000000..6ea9f6427 --- /dev/null +++ b/src/gpu-compute/LdsState.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *

from MemObject import MemObject

# SimObject description of the GPU's LDS; the C++ side is the LdsState
# class declared in gpu-compute/lds_state.hh.
class LdsState(MemObject):
    # Binding between this Python description and the C++ model.
    type = 'LdsState'
    cxx_class = 'LdsState'
    cxx_header = 'gpu-compute/lds_state.hh'

    # Capacity and address window of the LDS.
    size = Param.Int(65536, 'the size of the LDS')
    range = Param.AddrRange('64kB', "address space of the LDS")

    # Banking configuration.
    bankConflictPenalty = Param.Int(
        1, 'penalty per LDS bank conflict when ' 'accessing data')
    banks = Param.Int(32, 'Number of LDS banks')

    # Connection towards the owning compute unit.
    cuPort = SlavePort("port that goes to the compute unit")
Import('*')

# Nothing in this directory is built unless the GPU model was requested.
if not env['BUILD_GPU']:
    Return()

# Python SimObject descriptions, registered in their original order.
for sim_object_file in ('GPU.py', 'LdsState.py', 'X86GPUTLB.py'):
    SimObject(sim_object_file)

# The BRIG loader and HSAIL code objects are only built for the HSAIL ISA.
if env['TARGET_GPU_ISA'] == 'hsail':
    for hsail_source in ('brig_object.cc', 'hsail_code.cc'):
        Source(hsail_source)

# ISA-independent model sources.
gpu_sources = [
    'cl_driver.cc',
    'compute_unit.cc',
    'condition_register_state.cc',
    'dispatcher.cc',
    'exec_stage.cc',
    'fetch_stage.cc',
    'fetch_unit.cc',
    'global_memory_pipeline.cc',
    'gpu_dyn_inst.cc',
    'gpu_exec_context.cc',
    'gpu_static_inst.cc',
    'gpu_tlb.cc',
    'hsa_object.cc',
    'kernel_cfg.cc',
    'lds_state.cc',
    'local_memory_pipeline.cc',
    'of_scheduling_policy.cc',
    'pool_manager.cc',
    'rr_scheduling_policy.cc',
    'schedule_stage.cc',
    'scheduler.cc',
    'scoreboard_check_stage.cc',
    'shader.cc',
    'simple_pool_manager.cc',
    'tlb_coalescer.cc',
    'vector_register_file.cc',
    'vector_register_state.cc',
    'wavefront.cc',
]
for gpu_source in gpu_sources:
    Source(gpu_source)

# Individual debug flags.
gpu_debug_flags = [
    'BRIG',
    'GPUCoalescer',
    'GPUDisp',
    'GPUExec',
    'GPUFetch',
    'GPUHsailCFInfo',
    'GPUMem',
    'GPUPort',
    'GPUPrefetch',
    'GPUReg',
    'GPUSync',
    'GPUTLB',
    'HSALoader',
    'HSAIL',
    'HSAILObject',
    'Predictor',
    'WavefrontStack',
]
for gpu_debug_flag in gpu_debug_flags:
    DebugFlag(gpu_debug_flag)

# Umbrella flag that switches on the listed GPU flags together.
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                        'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *

from m5.objects.MemObject import MemObject

# The page table walker is only declared when the build environment sets
# FULL_SYSTEM.
if buildEnv['FULL_SYSTEM']:
    class X86PagetableWalker(MemObject):
        type = 'X86PagetableWalker'
        cxx_class = 'X86ISA::Walker'
        port = SlavePort("Port for the hardware table walker")
        system = Param.System(Parent.any, "system object")

# GPU TLB; the C++ implementation is X86ISA::GpuTLB in
# gpu-compute/gpu_tlb.hh.
class X86GPUTLB(MemObject):
    type = 'X86GPUTLB'
    cxx_class = 'X86ISA::GpuTLB'
    cxx_header = 'gpu-compute/gpu_tlb.hh'

    # Geometry.
    size = Param.Int(64, "TLB size (number of entries)")
    assoc = Param.Int(64, "TLB associativity")

    # Only present when X86PagetableWalker was declared above.
    if buildEnv['FULL_SYSTEM']:
        walker = Param.X86PagetableWalker(X86PagetableWalker(),
                                          "page table walker")

    # Timing and queueing.
    hitLatency = Param.Int(2, "Latency of a TLB hit")
    missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
    missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
    maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")

    # Ports.
    slave = VectorSlavePort("Port on side closer to CPU/CU")
    master = VectorMasterPort("Port on side closer to memory")

    # Policy / statistics switches.
    allocationPolicy = Param.Bool(True, "Allocate on an access")
    accessDistance = Param.Bool(False, "print accessDistance stats")

# TLB request coalescer; the C++ implementation is TLBCoalescer in
# gpu-compute/tlb_coalescer.hh.
class TLBCoalescer(MemObject):
    type = 'TLBCoalescer'
    cxx_class = 'TLBCoalescer'
    cxx_header = 'gpu-compute/tlb_coalescer.hh'
    probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
    coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
    slave = VectorSlavePort("Port on side closer to CPU/CU")
    master = VectorMasterPort("Port on side closer to memory")
    # Description previously read "Dispable Coalescing" (typo).
    disableCoalescing = Param.Bool(False, "Disable Coalescing")
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#include "gpu-compute/brig_object.hh" + +#include +#include +#include +#include + +#include +#include +#include + +#include "arch/hsail/Brig.h" +#include "base/misc.hh" +#include "base/trace.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "debug/HSALoader.hh" + +using namespace Brig; + +std::vector> + HsaObject::tryFileFuncs = { BrigObject::tryFile }; + +extern int getBrigDataTypeBytes(BrigType16_t t); + +const char *BrigObject::sectionNames[] = +{ + "hsa_data", + "hsa_code", + "hsa_operand", + ".shstrtab" +}; + +const char *segmentNames[] = +{ + "none", + "flat", + "global", + "readonly", + "kernarg", + "group", + "private", + "spill", + "args" +}; + +const uint8_t* +BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const +{ + // allow offs == size for dummy end pointers + assert(offs <= sectionInfo[sec].size); + + return sectionInfo[sec].ptr + offs; +} + +const char* +BrigObject::getString(int offs) const +{ + return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4); +} + +const BrigBase* +BrigObject::getCodeSectionEntry(int offs) const +{ + return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs); +} + +const BrigData* +BrigObject::getBrigBaseData(int offs) const +{ + return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs)); +} + +const uint8_t* +BrigObject::getData(int offs) const +{ + return getSectionOffset(DataSectionIndex, offs); +} + +const BrigOperand* +BrigObject::getOperand(int offs) const +{ + return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs); +} + +unsigned +BrigObject::getOperandPtr(int offs, int index) const +{ + unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1))); + + return *op_offs; +} + +const BrigInstBase* +BrigObject::getInst(int offs) const +{ + return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs); +} + +HsaCode* +BrigObject::getKernel(const std::string &name) const +{ + 
return nullptr; +} + +HsaCode* +BrigObject::getFunction(const std::string &name) const +{ + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + return functions[i]; + } + } + + return nullptr; +} + +void +BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr, + StorageMap *storageMap) +{ + while (dirPtr < endPtr) { + if (!dirPtr->byteCount) { + fatal("Bad directive size 0\n"); + } + + // calculate next pointer now so we can override it if needed + const BrigBase *nextDirPtr = brigNext(dirPtr); + + DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n", + dirPtr->kind, dirPtr->byteCount); + + switch (dirPtr->kind) { + case BRIG_KIND_DIRECTIVE_FUNCTION: + { + const BrigDirectiveExecutable *p M5_VAR_USED = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: " + "%d next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + if (p->firstCodeBlockEntry != p->nextModuleEntry) { + panic("Function calls are not fully supported yet!!: %s\n", + getString(p->name)); + + const char *name = getString(p->name); + + HsailCode *code_obj = nullptr; + + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + code_obj = functions[i]; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(name, p, this, + new StorageMap(storageMap)); + functions.push_back(code_obj); + } else { + panic("Multiple definition of Function!!: %s\n", + getString(p->name)); + } + + } + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_KERNEL: + { + const BrigDirectiveExecutable *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: " + "next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + const char *name = getString(p->name); + + if (name[0] == '&') + name++; + + 
std::string str = name; + char *temp; + int len = str.length(); + + if (str[len - 1] >= 'a' && str[len - 1] <= 'z') { + temp = new char[str.size() + 1]; + std::copy(str.begin(), str.end() , temp); + temp[str.size()] = '\0'; + } else { + temp = new char[str.size()]; + std::copy(str.begin(), str.end() - 1 , temp); + temp[str.size() - 1 ] = '\0'; + } + + std::string kernel_name = temp; + delete[] temp; + + HsailCode *code_obj = nullptr; + + for (const auto &kernel : kernels) { + if (kernel->name() == kernel_name) { + code_obj = kernel; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(kernel_name, p, this, + new StorageMap(storageMap)); + + kernels.push_back(code_obj); + } + + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *p = + reinterpret_cast(dirPtr); + + uint64_t readonlySize_old = + storageMap->getSize(BRIG_SEGMENT_READONLY); + + StorageElement* se = storageMap->addSymbol(p, this); + + DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n", + getString(p->name)); + + if (p->segment == BRIG_SEGMENT_READONLY) { + // readonly memory has initialization data + uint8_t* readonlyData_old = readonlyData; + + readonlyData = + new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)]; + + if (p->init) { + if ((p->type == BRIG_TYPE_ROIMG) || + (p->type == BRIG_TYPE_WOIMG) || + (p->type == BRIG_TYPE_SAMP) || + (p->type == BRIG_TYPE_SIG32) || + (p->type == BRIG_TYPE_SIG64)) { + panic("Read only data type not supported: %s\n", + getString(p->name)); + } + + const BrigOperand *brigOp = getOperand(p->init); + assert(brigOp->kind == + BRIG_KIND_OPERAND_CONSTANT_BYTES); + + const Brig::BrigData *operand_data M5_VAR_USED = + getBrigBaseData(((BrigOperandConstantBytes*) + brigOp)->bytes); + + assert((operand_data->byteCount / 4) > 0); + + uint8_t *symbol_data = + (uint8_t*)getData(((BrigOperandConstantBytes*) + 
brigOp)->bytes + 4); + + // copy the old data and add the new data + if (readonlySize_old > 0) { + memcpy(readonlyData, readonlyData_old, + readonlySize_old); + } + + memcpy(readonlyData + se->offset, symbol_data, + se->size); + + delete[] readonlyData_old; + } + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + panic("Label directives cannot be at the module level: %s\n", + getString(p->name)); + + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + const BrigDirectiveComment M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_LOC: + { + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_MODULE: + { + const BrigDirectiveModule M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_CONTROL: + { + DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_EXTENSION: + { + DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n"); + } + break; + default: + if (dirPtr->kind >= BRIG_KIND_INST_BEGIN && + dirPtr->kind <= BRIG_KIND_INST_END) + break; + + if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + dirPtr->kind <= BRIG_KIND_OPERAND_END) + break; + + warn("Unknown Brig directive kind: %d\n", dirPtr->kind); + break; + } + + dirPtr = nextDirPtr; + } +} + +HsaObject* +BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData) +{ + const char *brig_ident = "HSA BRIG"; + + if 
(memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH)) + return nullptr; + + return new BrigObject(fname, len, fileData); +} + +BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData) + : HsaObject(fname), storageMap(new StorageMap()) +{ + const char *brig_ident = "HSA BRIG"; + BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData; + + fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH), + "%s is not a BRIG file\n", fname); + + if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR || + mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) { + fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n", + fname, mod_hdr->brigMajor, mod_hdr->brigMinor, + BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR); + } + + fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section " + "count (%d) != expected value (%d)\n", fname, + mod_hdr->sectionCount, NumSectionIndices); + + for (int i = 0; i < NumSectionIndices; ++i) { + sectionInfo[i].ptr = nullptr; + } + + uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex); + for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) { + uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx]; + BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr; + + // It doesn't look like cprintf supports string precision values, + // but if this breaks, the right answer is to fix that + DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength, + sec_hdr->name); + + sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount]; + memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount); + sectionInfo[sec_idx].size = sec_hdr->byteCount; + } + + BrigSectionHeader *code_hdr = + (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr; + + DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, " + "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount, + code_hdr->nameLength); + + // start at offset 4 to skip initial null entry (see 
Brig spec) + processDirectives(getCodeSectionEntry(code_hdr->headerByteCount), + getCodeSectionEntry(sectionInfo[CodeSectionIndex].size), + storageMap); + + delete[] fileData; + + DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname); +} + +BrigObject::~BrigObject() +{ + for (int i = 0; i < NumSectionIndices; ++i) + if (sectionInfo[i].ptr) + delete[] sectionInfo[i].ptr; +} diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh new file mode 100644 index 000000000..59a585914 --- /dev/null +++ b/src/gpu-compute/brig_object.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#ifndef __BRIG_OBJECT_HH__ +#define __BRIG_OBJECT_HH__ + +#include +#include +#include +#include + +#include "arch/hsail/Brig.h" +#include "gpu-compute/hsa_object.hh" +#include "gpu-compute/hsail_code.hh" + +class LabelMap; +class StorageMap; + +/* @class BrigObject + * this class implements the BRIG loader object, and + * is used when the simulator directly executes HSAIL. + * this class is responsible for extracting all + * information about kernels contained in BRIG format + * and converts them to HsailCode objects that are + * usable by the simulator and emulated runtime. 
+ */ + +class BrigObject final : public HsaObject +{ + public: + enum SectionIndex + { + DataSectionIndex, + CodeSectionIndex, + OperandsSectionIndex, + NumSectionIndices + }; + + static const char *sectionNames[]; + + struct SectionInfo + { + uint8_t *ptr; + int size; + }; + + static HsaObject* tryFile(const std::string &fname, int len, + uint8_t *fileData); + + SectionInfo sectionInfo[NumSectionIndices]; + const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const; + + std::vector kernels; + std::vector functions; + std::string kern_block_name; + + void processDirectives(const Brig::BrigBase *dirPtr, + const Brig::BrigBase *endPtr, + StorageMap *storageMap); + + BrigObject(const std::string &fname, int len, uint8_t *fileData); + ~BrigObject(); + + // eventually these will need to be per-kernel not per-object-file + StorageMap *storageMap; + LabelMap *labelMap; + + const char* getString(int offs) const; + const Brig::BrigData* getBrigBaseData(int offs) const; + const uint8_t* getData(int offs) const; + const Brig::BrigBase* getCodeSectionEntry(int offs) const; + const Brig::BrigOperand* getOperand(int offs) const; + unsigned getOperandPtr(int offs, int index) const; + const Brig::BrigInstBase* getInst(int offs) const; + + HsaCode* getKernel(const std::string &name) const override; + HsaCode* getFunction(const std::string &name) const override; + + int numKernels() const override { return kernels.size(); } + + HsaCode* getKernel(int i) const override { return kernels[i]; } + + // pointer to the current kernel/function we're processing, so elements + // under construction can reference it. kinda ugly, but easier + // than passing it all over for the few places it's needed. + mutable HsailCode *currentCode; +}; + +// Utility function to bump Brig item pointer to next element given +// item size in bytes. Really just an add but with lots of casting. 
+template +T* +brigNext(T *ptr) +{ + Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr; + int size = base_ptr->byteCount; + assert(size); + + return (T*)((uint8_t*)ptr + size); +} + +#endif // __BRIG_OBJECT_HH__ diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc new file mode 100644 index 000000000..3b3291c03 --- /dev/null +++ b/src/gpu-compute/cl_driver.cc @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/cl_driver.hh" + +#include "base/intmath.hh" +#include "cpu/thread_context.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/hsa_object.hh" +#include "params/ClDriver.hh" +#include "sim/process.hh" +#include "sim/syscall_emul_buf.hh" + +ClDriver::ClDriver(ClDriverParams *p) + : EmulatedDriver(p), hsaCode(0) +{ + for (const auto &codeFile : p->codefile) + codeFiles.push_back(&codeFile); + + maxFuncArgsSize = 0; + + for (int i = 0; i < codeFiles.size(); ++i) { + HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]); + + for (int k = 0; k < obj->numKernels(); ++k) { + assert(obj->getKernel(k)); + kernels.push_back(obj->getKernel(k)); + kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData); + int kern_funcargs_size = kernels.back()->funcarg_size; + maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ? 
+ kern_funcargs_size : maxFuncArgsSize; + } + } + + int name_offs = 0; + int code_offs = 0; + + for (int i = 0; i < kernels.size(); ++i) { + kernelInfo.push_back(HsaKernelInfo()); + HsaCode *k = kernels[i]; + + k->generateHsaKernelInfo(&kernelInfo[i]); + + kernelInfo[i].name_offs = name_offs; + kernelInfo[i].code_offs = code_offs; + + name_offs += k->name().size() + 1; + code_offs += k->numInsts() * sizeof(GPUStaticInst*); + } +} + +void +ClDriver::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; + dispatcher->setFuncargsSize(maxFuncArgsSize); +} + +int +ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags) +{ + int fd = p->allocFD(-1, filename, 0, 0, false); + FDEntry *fde = p->getFDEntry(fd); + fde->driver = this; + + return fd; +} + +int +ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) +{ + int index = 2; + Addr buf_addr = process->getSyscallArg(tc, index); + + switch (req) { + case HSA_GET_SIZES: + { + TypedBufferArg sizes(buf_addr); + sizes->num_kernels = kernels.size(); + sizes->string_table_size = 0; + sizes->code_size = 0; + sizes->readonly_size = 0; + + if (kernels.size() > 0) { + // all kernels will share the same read-only memory + sizes->readonly_size = + kernels[0]->getSize(HsaCode::MemorySegment::READONLY); + // check our assumption + for (int i = 1; ireadonly_size == + kernels[i]->getSize(HsaCode::MemorySegment::READONLY)); + } + } + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + // add one for terminating '\0' + sizes->string_table_size += k->name().size() + 1; + sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*); + } + + sizes.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_KINFO: + { + TypedBufferArg + kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size()); + + for (int i = 0; i < kernels.size(); ++i) { + HsaKernelInfo *ki = &kinfo[i]; + ki->name_offs = kernelInfo[i].name_offs; + ki->code_offs = kernelInfo[i].code_offs; + ki->sRegCount = 
kernelInfo[i].sRegCount; + ki->dRegCount = kernelInfo[i].dRegCount; + ki->cRegCount = kernelInfo[i].cRegCount; + ki->static_lds_size = kernelInfo[i].static_lds_size; + ki->private_mem_size = kernelInfo[i].private_mem_size; + ki->spill_mem_size = kernelInfo[i].spill_mem_size; + } + + kinfo.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_STRINGS: + { + int string_table_size = 0; + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + string_table_size += k->name().size() + 1; + } + + BufferArg buf(buf_addr, string_table_size); + char *bufp = (char*)buf.bufferPtr(); + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + const char *n = k->name().c_str(); + + // idiomatic string copy + while ((*bufp++ = *n++)); + } + + assert(bufp - (char *)buf.bufferPtr() == string_table_size); + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_READONLY_DATA: + { + // we can pick any kernel --- they share the same + // readonly segment (this assumption is checked in GET_SIZES) + uint64_t size = + kernels.back()->getSize(HsaCode::MemorySegment::READONLY); + BufferArg data(buf_addr, size); + char *datap = (char *)data.bufferPtr(); + memcpy(datap, + kernels.back()->readonly_data, + size); + data.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CODE: + { + // set hsaCode pointer + hsaCode = buf_addr; + int code_size = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst); + } + + TypedBufferArg buf(buf_addr, code_size); + TheGpuISA::RawMachInst *bufp = buf; + + int buf_idx = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + + for (int j = 0; j < k->numInsts(); ++j) { + bufp[buf_idx] = k->insts()->at(j); + ++buf_idx; + } + } + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CU_CNT: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs(); + 
buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_VSZ: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = VSZ; + buf.copyOut(tc->getMemProxy()); + } + break; + + default: + fatal("ClDriver: bad ioctl %d\n", req); + } + + return 0; +} + +const char* +ClDriver::codeOffToKernelName(uint64_t code_ptr) +{ + assert(hsaCode); + uint32_t code_offs = code_ptr - hsaCode; + + for (int i = 0; i < kernels.size(); ++i) { + if (code_offs == kernelInfo[i].code_offs) { + return kernels[i]->name().c_str(); + } + } + + return nullptr; +} + +ClDriver* +ClDriverParams::create() +{ + return new ClDriver(this); +} diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh new file mode 100644 index 000000000..03567bab5 --- /dev/null +++ b/src/gpu-compute/cl_driver.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CL_DRIVER_HH__ +#define __CL_DRIVER_HH__ + +#include + +#include "gpu-compute/hsa_kernel_info.hh" +#include "sim/emul_driver.hh" + +class GpuDispatcher; +class HsaCode; +class LiveProcess; +class ThreadContext; + +struct ClDriverParams; + +class ClDriver final : public EmulatedDriver +{ + public: + ClDriver(ClDriverParams *p); + void handshake(GpuDispatcher *_dispatcher); + int open(LiveProcess *p, ThreadContext *tc, int mode, int flags); + int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req); + const char* codeOffToKernelName(uint64_t code_ptr); + + private: + GpuDispatcher *dispatcher; + + std::vector codeFiles; + + // All the kernels we know about + std::vector kernels; + std::vector functions; + + std::vector kernelInfo; + + // maximum size necessary for function arguments + int maxFuncArgsSize; + // The host virtual address for the kernel code + uint64_t hsaCode; +}; + +#endif // __CL_DRIVER_HH__ diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh new file mode 100644 index 000000000..75297a2d2 --- /dev/null +++ b/src/gpu-compute/cl_event.hh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Marc Orr + */ + +#ifndef __GPU_CL_EVENT_HH__ +#define __GPU_CL_EVENT_HH__ + +struct HsaQueueEntry; + +class _cl_event { + public: + _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { } + + volatile bool done; + HsaQueueEntry *hsaTaskPtr; + uint64_t start; + uint64_t end; +}; + +#endif // __GPU_CL_EVENT_HH__ diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh new file mode 100644 index 000000000..126cf6c50 --- /dev/null +++ b/src/gpu-compute/code_enums.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CODE_ENUMS_HH__
+#define __CODE_ENUMS_HH__
+/*
+ * Predicates over Enums::OT_* operation types. Every macro expands its
+ * argument more than once, so pass an expression without side effects.
+ *
+ * Segment membership: true when a lies in the closed enumerator range
+ * OT_<SEG>_READ .. OT_<SEG>_LDAS.
+ * NOTE(review): this assumes the enum declares each segment's operation
+ * types contiguously, READ first and LDAS last -- confirm against the
+ * enum definition.
+ */
+#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
+                    && (a)<=Enums::OT_GLOBAL_LDAS)
+#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
+                    && (a)<=Enums::OT_SHARED_LDAS)
+#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
+                    && (a)<=Enums::OT_PRIVATE_LDAS)
+#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
+                    && (a)<=Enums::OT_SPILL_LDAS)
+#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
+                    && (a)<=Enums::OT_READONLY_LDAS)
+#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)
+/* LDAS operation of any segment */
+#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
+                    ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
+                    ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)
+/*
+ * Reads. IS_OT_READ matches a read of any segment; the suffixed forms
+ * match subsets: _GM = global, spill and readonly reads, _LM = shared,
+ * _RM = readonly only, _PM = private.
+ */
+#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
+                    ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
+                    ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)
+
+#define IS_OT_READ_GM(a) \
+    ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
+     ||(a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)
+
+#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)
+/* Writes: same grouping as the read predicates (no _RM variant exists) */
+#define IS_OT_WRITE(a) \
+    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
+     ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
+     ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)
+
+#define IS_OT_WRITE_GM(a) \
+    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
+     ||(a)==Enums::OT_READONLY_WRITE)
+
+#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)
+
+#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)
+/*
+ * Atomics. Note that the _GM and _LM forms also match memory fences:
+ * _GM accepts OT_GLOBAL_MEMFENCE and OT_BOTH_MEMFENCE, _LM accepts
+ * OT_SHARED_MEMFENCE and OT_BOTH_MEMFENCE. IS_OT_ATOMIC itself does not.
+ */
+#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+                    ||(a)==Enums::OT_SHARED_ATOMIC \
+                    ||(a)==Enums::OT_PRIVATE_ATOMIC \
+                    ||(a)==Enums::OT_SPILL_ATOMIC \
+                    ||(a)==Enums::OT_READONLY_ATOMIC \
+                    ||(a)==Enums::OT_FLAT_ATOMIC)
+
+#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+                    ||(a)==Enums::OT_SPILL_ATOMIC \
+                    ||(a)==Enums::OT_READONLY_ATOMIC \
+                    ||(a)==Enums::OT_GLOBAL_MEMFENCE \
+                    ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
+                    ||(a)==Enums::OT_SHARED_MEMFENCE \
+                    ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
+/* HIST operations: any segment, then the _GM / _LM / _PM subsets */
+#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
+                    ||(a)==Enums::OT_SHARED_HIST \
+                    ||(a)==Enums::OT_PRIVATE_HIST \
+                    ||(a)==Enums::OT_SPILL_HIST \
+                    ||(a)==Enums::OT_READONLY_HIST \
+                    ||(a)==Enums::OT_FLAT_HIST)
+
+#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
+                    ||(a)==Enums::OT_SPILL_HIST \
+                    ||(a)==Enums::OT_READONLY_HIST)
+
+#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)
+
+#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)
+
+#endif // __CODE_ENUMS_HH__
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
new file mode 100644
index 000000000..d3622007a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.cc
@@ -0,0 +1,1817 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#include "gpu-compute/compute_unit.hh" + +#include "base/output.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUExec.hh" +#include "debug/GPUFetch.hh" +#include "debug/GPUMem.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUSync.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/page_table.hh" +#include "sim/process.hh" + +ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), + cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), + spBypassPipeLength(p->spbypass_pipe_length), + dpBypassPipeLength(p->dpbypass_pipe_length), + issuePeriod(p->issue_period), + numGlbMemUnits(p->num_global_mem_pipes), + numLocMemUnits(p->num_shared_mem_pipes), + perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), + prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), + xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), + countPages(p->countPages), barrier_id(0), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), + resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), + _masterId(p->system->getMasterId(name() + ".ComputeUnit")), + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) +{ + // this check will be eliminated once we have 
wavefront size support added + fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + // calculate how many cycles a vector load or store will need to transfer + // its data over the corresponding buses + numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + / (double)vrfToCoalescerBusWidth); + + numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + / coalescerToVrfBusWidth; + + lastVaddrWF.resize(numSIMDs); + wfList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + lastVaddrWF[j].resize(p->n_wf); + + for (int i = 0; i < p->n_wf; ++i) { + lastVaddrWF[j][i].resize(VSZ); + + wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); + wfList[j][i]->setParent(this); + + for (int k = 0; k < VSZ; ++k) { + lastVaddrWF[j][i][k] = 0; + } + } + } + + lastVaddrPhase.resize(numSIMDs); + + for (int i = 0; i < numSIMDs; ++i) { + lastVaddrPhase[i] = LastVaddrWave(); + } + + lastVaddrCU = LastVaddrWave(); + + lds.setParent(this); + + if (p->execPolicy == "OLDEST-FIRST") { + exec_policy = EXEC_POLICY::OLDEST; + } else if (p->execPolicy == "ROUND-ROBIN") { + exec_policy = EXEC_POLICY::RR; + } else { + fatal("Invalid WF execution policy (CU)\n"); + } + + memPort.resize(VSZ); + + // resize the tlbPort vectorArray + int tlbPort_width = perLaneTLB ? 
VSZ : 1; + tlbPort.resize(tlbPort_width); + + cuExitCallback = new CUExitCallback(this); + registerExitCallback(cuExitCallback); + + xactCasLoadMap.clear(); + lastExecCycle.resize(numSIMDs, 0); + + for (int i = 0; i < vrf.size(); ++i) { + vrf[i]->setParent(this); + } + + numVecRegsPerSimd = vrf[0]->numRegs(); +} + +ComputeUnit::~ComputeUnit() +{ + // Delete wavefront slots + + for (int j = 0; j < numSIMDs; ++j) + for (int i = 0; i < shader->n_wf; ++i) { + delete wfList[j][i]; + } + + readyList.clear(); + waveStatusList.clear(); + dispatchList.clear(); + vectorAluInstAvail.clear(); + delete cuExitCallback; + delete ldsPort; +} + +void +ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) +{ + w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + + w->workgroupsz[0] = ndr->q.wgSize[0]; + w->workgroupsz[1] = ndr->q.wgSize[1]; + w->workgroupsz[2] = ndr->q.wgSize[2]; + w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2]; + w->gridsz[0] = ndr->q.gdSize[0]; + w->gridsz[1] = ndr->q.gdSize[1]; + w->gridsz[2] = ndr->q.gdSize[2]; + w->kernelArgs = ndr->q.args; + w->privSizePerItem = ndr->q.privMemPerItem; + w->spillSizePerItem = ndr->q.spillMemPerItem; + w->roBase = ndr->q.roMemStart; + w->roSize = ndr->q.roMemTotal; +} + +void +ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart) +{ + wfCtx->cnt = cnt; + + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < VSZ; ++k) { + if (k + cnt * VSZ < trueWgSizeTotal) + init_mask[k] = 1; + } + + wfCtx->init_mask = init_mask.to_ullong(); + wfCtx->exec_mask = init_mask.to_ullong(); + + for (int i = 0; i < VSZ; ++i) { + wfCtx->bar_cnt[i] = 0; + } + + wfCtx->max_bar_cnt = 0; + wfCtx->old_barrier_cnt = 0; + wfCtx->barrier_cnt = 0; + + wfCtx->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + + wfCtx->spillBase = ndr->q.spillMemStart; + 
ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + + wfCtx->pc = 0; + wfCtx->rpc = UINT32_MAX; + + // set the wavefront context to have a pointer to this section of the LDS + wfCtx->ldsChunk = ldsChunk; + + // WG state + wfCtx->wg_id = ndr->globalWgId; + wfCtx->barrier_id = barrier_id; + + // Kernel wide state + wfCtx->ndr = ndr; +} + +void +ComputeUnit::updateEvents() { + + if (!timestampVec.empty()) { + uint32_t vecSize = timestampVec.size(); + uint32_t i = 0; + while (i < vecSize) { + if (timestampVec[i] <= shader->tick_cnt) { + std::pair regInfo = regIdxVec[i]; + vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), + statusVec[i]); + timestampVec.erase(timestampVec.begin() + i); + regIdxVec.erase(regIdxVec.begin() + i); + statusVec.erase(statusVec.begin() + i); + --vecSize; + --i; + } + ++i; + } + } + + for (int i = 0; i< numSIMDs; ++i) { + vrf[i]->updateEvents(); + } +} + + +void +ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal) +{ + static int _n_wave = 0; + int cnt = wfCtx->cnt; + NDRange *ndr = wfCtx->ndr; + + // Fill in Kernel state + FillKernelState(w, ndr); + + w->kern_id = ndr->dispatchId; + w->dynwaveid = cnt; + w->init_mask = wfCtx->init_mask; + + for (int k = 0; k < VSZ; ++k) { + w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + + w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * + trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + + w->workitemid[0][k]; + } + + w->old_barrier_cnt = wfCtx->old_barrier_cnt; + w->barrier_cnt = wfCtx->barrier_cnt; + w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + + for (int i = 0; i < VSZ; ++i) { + w->bar_cnt[i] = wfCtx->bar_cnt[i]; + } + + w->max_bar_cnt = wfCtx->max_bar_cnt; + w->privBase = wfCtx->privBase; + w->spillBase = wfCtx->spillBase; + + w->pushToReconvergenceStack(wfCtx->pc, 
wfCtx->rpc, wfCtx->exec_mask); + + // WG state + w->wg_id = wfCtx->wg_id; + w->dispatchid = wfCtx->ndr->dispatchId; + w->workgroupid[0] = w->wg_id % ndr->numWg[0]; + w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; + w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); + + w->barrier_id = wfCtx->barrier_id; + w->stalledAtBarrier = false; + + // move this from the context into the actual wavefront + w->ldsChunk = wfCtx->ldsChunk; + + int32_t refCount M5_VAR_USED = + lds.increaseRefCounter(w->dispatchid, w->wg_id); + DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", + cu_id, w->wg_id, refCount); + + w->instructionBuffer.clear(); + + if (w->pendingFetch) + w->dropFetch = true; + + // is this the last wavefront in the workgroup + // if set the spillWidth to be the remaining work-items + // so that the vector access is correct + if ((cnt + 1) * VSZ >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + } else { + w->spillWidth = VSZ; + } + + DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " + "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + + w->start(++_n_wave, ndr->q.code_ptr); +} + +void +ComputeUnit::StartWorkgroup(NDRange *ndr) +{ + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, + ndr->q.ldsSize); + + // Send L1 cache acquire + // isKernel + isAcquire = Kernel Begin + if (shader->impl_kern_boundary_sync) { + GPUDynInstPtr gpuDynInst = std::make_shared(nullptr, + nullptr, + nullptr, 0); + + gpuDynInst->useContinuation = false; + gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; + gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; + injectGlobalMemFence(gpuDynInst, true); + } + + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d 
= 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + } + + uint64_t origSpillMemStart = ndr->q.spillMemStart; + // calculate the number of 32-bit vector registers required by wavefront + int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + int cnt = 0; + + // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time + for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { + Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; + // Check if this wavefront slot is available: + // It must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->status == Wavefront::S_STOPPED) { + // if we have scheduled all work items then stop + // scheduling wavefronts + if (cnt * VSZ >= trueWgSizeTotal) + break; + + // reserve vector registers for the scheduled wavefront + assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); + uint32_t normSize = 0; + + w->startVgprIndex = vrf[m % numSIMDs]->manager-> + allocateRegion(vregDemand, &normSize); + + w->reservedVectorRegs = normSize; + vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + + WFContext wfCtx; + + InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, + ldsChunk, origSpillMemStart); + + StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); + ++cnt; + } + } + ++barrier_id; +} + +int +ComputeUnit::ReadyWorkgroup(NDRange *ndr) +{ + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); + } + + DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + + // calculate the number of 32-bit vector registers required by each + // work item of the work group + int vregDemandPerWI = 
ndr->q.sRegCount + (2 * ndr->q.dRegCount); + bool vregAvail = true; + int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int freeWfSlots = 0; + // check if the total number of VGPRs required by all WFs of the WG + // fit in the VRFs of all SIMD units + assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + int numMappedWfs = 0; + std::vector numWfsPerSimd; + numWfsPerSimd.resize(numSIMDs, 0); + // find how many free WF slots we have across all SIMDs + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numSIMDs; ++i) { + if (wfList[i][j]->status == Wavefront::S_STOPPED) { + // count the number of free WF slots + ++freeWfSlots; + if (numMappedWfs < numWfs) { + // count the WFs to be assigned per SIMD + numWfsPerSimd[i]++; + } + numMappedWfs++; + } + } + } + + // if there are enough free WF slots then find if there are enough + // free VGPRs per SIMD based on the WF->SIMD mapping + if (freeWfSlots >= numWfs) { + for (int j = 0; j < numSIMDs; ++j) { + // find if there are enough free VGPR regions in the SIMD's VRF + // to accommodate the WFs of the new WG that would be mapped to + // this SIMD unit + vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], + vregDemandPerWI); + + // stop searching if there is at least one SIMD + // whose VRF does not have enough free VGPR pools. 
+ // This is because a WG is scheduled only if ALL + // of its WFs can be scheduled + if (!vregAvail) + break; + } + } + + DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", + freeWfSlots, vregAvail); + + if (!vregAvail) { + ++numTimesWgBlockedDueVgprAlloc; + } + + // Return true if enough WF slots to submit workgroup and if there are + // enough VGPRs to schedule all WFs to their SIMD units + if (!lds.canReserve(ndr->q.ldsSize)) { + wgBlockedDueLdsAllocation++; + } + + // Return true if (a) there are enough free WF slots to submit + // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their + // SIMD units and (c) if there is enough space in LDS + return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); +} + +int +ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) +{ + DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); + int ccnt = 0; + + for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { + Wavefront *w = wfList[i_simd][i_wf]; + + if (w->status == Wavefront::S_RUNNING) { + DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); + + DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", + w->barrier_id, _barrier_id); + + DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + w->barrier_cnt, bcnt); + } + + if (w->status == Wavefront::S_RUNNING && + w->barrier_id == _barrier_id && w->barrier_cnt == bcnt && + !w->outstanding_reqs) { + ++ccnt; + + DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " + "%d\n", i_simd, i_wf, ccnt); + } + } + } + + DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", + cu_id, ccnt, bslots); + + return ccnt == bslots; +} + +// Check if the current wavefront is blocked on additional resources. 
+/**
+ * Tell whether the wavefront in slot (simdId, wfSlotId) has to give up
+ * its SIMD because another wavefront is ahead of it in an xact_cas_ld
+ * per-address queue.
+ *
+ * @param simdId   SIMD unit of the wavefront being checked
+ * @param wfSlotId wavefront slot within that SIMD unit
+ * @return true when xact_cas_mode is set and the wavefront appears in at
+ *         least one per-address queue whose head is a different
+ *         wavefront; false otherwise
+ */
+bool
+ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
+{
+    // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
+    // magic instructions will impact the scheduling of wavefronts
+    if (!xact_cas_mode)
+        return false;
+
+    /*
+     * When a wavefront calls xact_cas_ld, it adds itself to a per address
+     * queue. All per address queues are managed by the xactCasLoadMap.
+     *
+     * A wavefront is not blocked if: it is not in ANY per address queue or
+     * if it is at the head of a per address queue.
+     */
+    for (const auto &mapEntry : xactCasLoadMap) {
+        // bind by reference: this runs for every scheduling decision, so
+        // copying each map entry and its queue would be wasted work
+        const auto &waveIDQueue = mapEntry.second.waveIDQueue;
+
+        if (waveIDQueue.empty())
+            continue;
+
+        const auto &owner_wave = waveIDQueue.front();
+
+        for (const auto &cur_wave : waveIDQueue) {
+            if (cur_wave.simdId != simdId || cur_wave.wfSlotId != wfSlotId)
+                continue;
+
+            // this WF is queued on this address; 2 possibilities
+            // 1: this WF has a green light
+            // 2: another WF has a green light
+            if (owner_wave.simdId != cur_wave.simdId ||
+                owner_wave.wfSlotId != cur_wave.wfSlotId) {
+                // possibility 2: blocked, and no other queue can undo
+                // that, so the answer is already known
+                return true;
+            }
+
+            // possibility 1: not blocked on this address; the remaining
+            // queues still have to be checked
+            break;
+        }
+    }
+
+    return false;
+}
+
+// Execute one clock worth of work on the ComputeUnit.
+void +ComputeUnit::exec() +{ + updateEvents(); + // Execute pipeline stages in reverse order to simulate + // the pipeline latency + globalMemoryPipe.exec(); + localMemoryPipe.exec(); + execStage.exec(); + scheduleStage.exec(); + scoreboardCheckStage.exec(); + fetchStage.exec(); + + totalCycles++; +} + +void +ComputeUnit::init() +{ + // Initialize CU Bus models + glbMemToVrfBus.init(&shader->tick_cnt, 1); + locMemToVrfBus.init(&shader->tick_cnt, 1); + nextGlbMemBus = 0; + nextLocMemBus = 0; + fatal_if(numGlbMemUnits > 1, + "No support for multiple Global Memory Pipelines exists!!!"); + vrfToGlobalMemPipeBus.resize(numGlbMemUnits); + for (int j = 0; j < numGlbMemUnits; ++j) { + vrfToGlobalMemPipeBus[j] = WaitClass(); + vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + + fatal_if(numLocMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vrfToLocalMemPipeBus.resize(numLocMemUnits); + for (int j = 0; j < numLocMemUnits; ++j) { + vrfToLocalMemPipeBus[j] = WaitClass(); + vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + vectorRegsReserved.resize(numSIMDs, 0); + aluPipe.resize(numSIMDs); + wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); + + for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { + wfWait[i] = WaitClass(); + wfWait[i].init(&shader->tick_cnt, 1); + } + + for (int i = 0; i < numSIMDs; ++i) { + aluPipe[i] = WaitClass(); + aluPipe[i].init(&shader->tick_cnt, 1); + } + + // Setup space for call args + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + wfList[j][i]->initCallArgMem(shader->funcargs_size); + } + } + + // Initializing pipeline resources + readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); + waveStatusList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + waveStatusList[j].push_back( + std::make_pair(wfList[j][i], BLOCKED)); + } + } + + for (int j = 0; j < (numSIMDs + numGlbMemUnits + 
numLocMemUnits); ++j) { + dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + } + + fetchStage.init(this); + scoreboardCheckStage.init(this); + scheduleStage.init(this); + execStage.init(this); + globalMemoryPipe.init(this); + localMemoryPipe.init(this); + // initialize state for statistics calculation + vectorAluInstAvail.resize(numSIMDs, false); + shrMemInstAvail = 0; + glbMemInstAvail = 0; +} + +bool +ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) +{ + // Ruby has completed the memory op. Schedule the mem_resp_event at the + // appropriate cycle to process the timing memory response + // This delay represents the pipeline delay + SenderState *sender_state = safe_cast(pkt->senderState); + int index = sender_state->port_index; + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + // Is the packet returned a Kernel End or Barrier + if (pkt->req->isKernel() && pkt->req->isRelease()) { + Wavefront *w = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + + // Check if we are waiting on Kernel End Release + if (w->status == Wavefront::S_RETURNING) { + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, + w->wfDynId, w->kern_id); + + computeUnit->shader->dispatcher->notifyWgCompl(w); + w->status = Wavefront::S_STOPPED; + } else { + w->outstanding_reqs--; + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, w->barrier_cnt); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), 
+ gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } + + ComputeUnit::DataPort::MemRespEvent *mem_resp_event = + new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index], + pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + index, pkt->req->getPaddr()); + + computeUnit->schedule(mem_resp_event, + curTick() + computeUnit->resp_tick_latency); + return true; +} + +void +ComputeUnit::DataPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr()); + + /** Currently Ruby can return false due to conflicts for the particular + * cache block or address. Thus other requests should be allowed to + * pass and the data port should expect multiple retries. 
*/ + if (!sendTimingReq(pkt)) { + DPRINTF(GPUMem, "failed again!\n"); + break; + } else { + DPRINTF(GPUMem, "successful!\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) +{ + computeUnit->fetchStage.processFetchReturn(pkt); + + return true; +} + +void +ComputeUnit::SQCPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + Wavefront *wavefront M5_VAR_USED = retries.front().second; + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + if (!sendTimingReq(pkt)) { + DPRINTF(GPUFetch, "failed again!\n"); + break; + } else { + DPRINTF(GPUFetch, "successful!\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + // There must be a way around this check to do the globalMemStart... + Addr tmp_vaddr = pkt->req->getVaddr(); + + updatePageDivergenceDist(tmp_vaddr); + + pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(), + pkt->req->getFlags(), pkt->req->masterId(), + pkt->req->getPC()); + + // figure out the type of the request to set read/write + BaseTLB::Mode TLB_mode; + assert(pkt->isRead() || pkt->isWrite()); + + // Check write before read for atomic operations + // since atomic operations should use BaseTLB::Write + if (pkt->isWrite()){ + TLB_mode = BaseTLB::Write; + } else if (pkt->isRead()) { + TLB_mode = BaseTLB::Read; + } else { + fatal("pkt is not a read nor a write\n"); + } + + tlbCycles -= curTick(); + ++tlbRequests; + + int tlbPort_index = perLaneTLB ? 
index : 0; + + if (shader->timingSim) { + if (debugSegFault) { + Process *p = shader->gpuTc->getProcessPtr(); + Addr vaddr = pkt->req->getVaddr(); + unsigned size = pkt->getSize(); + + if ((vaddr + size - 1) % 64 < vaddr % 64) { + panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); + } + + Addr paddr; + + if (!p->pTable->translate(vaddr, paddr)) { + if (!p->fixupStackFault(vaddr)) { + panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + vaddr); + } + } + } + + // This is the SenderState needed upon return + pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); + + // This is the senderState needed by the TLB hierarchy to function + TheISA::GpuTLB::TranslationState *translation_state = + new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + pkt->senderState); + + pkt->senderState = translation_state; + + if (functionalTLB) { + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + assert(hit_level != -1); + hitsPerTLBLevel[hit_level]++; + + // New SenderState for the memory access + X86ISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state->saved; + delete sender_state; + + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + uint8_t *tmpData = pkt->getPtr(); + + // this is necessary because the GPU TLB receives packets instead + // of requests. when the translation is complete, all relevent + // fields in the request will be populated, but not in the packet. + // here we create the new packet so we can set the size, addr, + // and proper flags. 
+ PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + pkt->dataStatic(tmpData); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, + index, nullptr); + + gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); + gpuDynInst->tlbHitLevel[index] = hit_level; + + + // translation is done. Schedule the mem_req_event at the + // appropriate cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " + "scheduled\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else if (tlbPort[tlbPort_index]->isStalled()) { + assert(tlbPort[tlbPort_index]->retries.size() > 0); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet will be issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. 
+ tlbPort[tlbPort_index]->stallPort(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, + "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); + } + } else { + if (pkt->cmd == MemCmd::MemFenceReq) { + gpuDynInst->statusBitVector = VectorMask(0); + } else { + gpuDynInst->statusBitVector &= (~(1ll << index)); + } + + // New SenderState for the memory access + delete pkt->senderState; + + // Because it's atomic operation, only need TLB translation state + pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, + shader->gpuTc); + + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // the addr of the packet is not modified, so we need to create a new + // packet, or otherwise the memory access will have the old virtual + // address sent in the translation packet, instead of the physical + // address returned by the translation. + PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); + new_pkt->dataStatic(pkt->getPtr()); + + // Translation is done. It is safe to send the packet to memory. 
+ memPort[0]->sendFunctional(new_pkt); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + new_pkt->req->getPaddr()); + + // safe_cast the senderState + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete new_pkt; + delete pkt->senderState; + delete pkt->req; + delete pkt; + } +} + +void +ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); +} + +void +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, + Request* req) +{ + if (!req) { + req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1); + } + req->setPaddr(0); + if (kernelLaunch) { + req->setFlags(Request::KERNEL); + } + + gpuDynInst->s_type = SEG_GLOBAL; + + // for non-kernel MemFence operations, memorder flags are set depending + // on which type of request is currently being sent, so this + // should be set by the caller (e.g. 
if an inst has acq-rel + // semantics, it will send one acquire req an one release req) + gpuDynInst->setRequestFlags(req, kernelLaunch); + + // a mem fence must correspond to an acquire/release request + assert(req->isAcquire() || req->isRelease()); + + // create packet + PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + + // set packet's sender state + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + + // send the packet + sendSyncRequest(gpuDynInst, 0, pkt); +} + +const char* +ComputeUnit::DataPort::MemRespEvent::description() const +{ + return "ComputeUnit memory response event"; +} + +void +ComputeUnit::DataPort::MemRespEvent::process() +{ + DataPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit = dataPort->computeUnit; + + assert(gpuDynInst); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr(), dataPort->index); + + Addr paddr = pkt->req->getPaddr(); + + if (pkt->cmd != MemCmd::MemFenceResp) { + int index = gpuDynInst->memStatusVector[paddr].back(); + + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); + + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); + + if (pkt->isRead() || pkt->isWrite()) { + + if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } else { + assert(gpuDynInst->statusVector[index] > 0); + gpuDynInst->statusVector[index]--; + + if (!gpuDynInst->statusVector[index]) + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } + + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); + + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); + + 
while (iter != end) { + assert(iter->second.empty()); + ++iter; + } + + gpuDynInst->memStatusVector.clear(); + + if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + gpuDynInst->statusVector.clear(); + + if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) + || MO_ANR(gpuDynInst->m_op)) { + assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMLdRespFIFO() + .push(gpuDynInst); + } else { + assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMStRespFIFO() + .push(gpuDynInst); + } + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + + // after clearing the status vectors, + // see if there is a continuation to perform + // the continuation may generate more work for + // this memory request + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + } + } else { + gpuDynInst->statusBitVector = VectorMask(0); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +ComputeUnit* +ComputeUnitParams::create() +{ + return new ComputeUnit(this); +} + +bool +ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line = pkt->req->getPaddr(); + + DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, + pkt->req->getVaddr(), line); + + assert(pkt->senderState); + computeUnit->tlbCycles += curTick(); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast(pkt->senderState); + + // no PageFaults are permitted for data accesses + if (!translation_state->tlbEntry->valid) { + DTLBPort::SenderState 
*sender_state = + safe_cast(translation_state->saved); + + Wavefront *w M5_VAR_USED = + computeUnit->wfList[sender_state->_gpuDynInst->simdId] + [sender_state->_gpuDynInst->wfSlotId]; + + DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, + pkt->req->getVaddr()); + } + + assert(translation_state->tlbEntry->valid); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + computeUnit->hitsPerTLBLevel[hit_level]++; + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + + // for prefetch pkt + BaseTLB::Mode TLB_mode = translation_state->tlbMode; + + delete translation_state; + + // use the original sender state to know how to close this transaction + DTLBPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + int mp_index = sender_state->portIndex; + Addr vaddr = pkt->req->getVaddr(); + gpuDynInst->memStatusVector[line].push_back(mp_index); + gpuDynInst->tlbHitLevel[mp_index] = hit_level; + + MemCmd requestCmd; + + if (pkt->cmd == MemCmd::ReadResp) { + requestCmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + requestCmd = MemCmd::WriteReq; + } else if (pkt->cmd == MemCmd::SwapResp) { + requestCmd = MemCmd::SwapReq; + } else { + panic("unsupported response to request conversion %s\n", + pkt->cmd.toString()); + } + + if (computeUnit->prefetchDepth) { + int simdId = gpuDynInst->simdId; + int wfSlotId = gpuDynInst->wfSlotId; + Addr last = 0; + + switch(computeUnit->prefetchType) { + case Enums::PF_CU: + last = computeUnit->lastVaddrCU[mp_index]; + break; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrPhase[simdId][mp_index]; + break; + case Enums::PF_WF: + last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; + default: + break; + } + + DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", + computeUnit->cu_id, simdId, wfSlotId, mp_index, last); + 
+ int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - + roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift + : 0; + + DPRINTF(GPUPrefetch, "Stride is %d\n", stride); + + computeUnit->lastVaddrCU[mp_index] = vaddr; + computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; + + stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? + computeUnit->prefetchStride: stride; + + DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, + computeUnit->cu_id, simdId, wfSlotId, mp_index); + + DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); + + // Prefetch Next few pages atomically + for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { + DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, + vaddr+stride*pf*TheISA::PageBytes); + + if (!stride) + break; + + Request *prefetch_req = new Request(0, vaddr + stride * pf * + TheISA::PageBytes, + sizeof(uint8_t), 0, + computeUnit->masterId(), + 0, 0, 0); + + PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); + uint8_t foo = 0; + prefetch_pkt->dataStatic(&foo); + + // Because it's atomic operation, only need TLB translation state + prefetch_pkt->senderState = + new TheISA::GpuTLB::TranslationState(TLB_mode, + computeUnit->shader->gpuTc, + true); + + // Currently prefetches are zero-latency, hence the sendFunctional + sendFunctional(prefetch_pkt); + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *tlb_state = + safe_cast( + prefetch_pkt->senderState); + + + delete tlb_state->tlbEntry; + delete tlb_state; + delete prefetch_pkt->req; + delete prefetch_pkt; + } + } + + // First we must convert the response cmd back to a request cmd so that + // the request can be sent through the cu's master port + PacketPtr new_pkt = new Packet(pkt->req, requestCmd); + new_pkt->dataStatic(pkt->getPtr()); + delete pkt->senderState; + delete pkt; + + // New SenderState for the memory access + new_pkt->senderState = + new 
ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, + nullptr); + + // translation is done. Schedule the mem_req_event at the appropriate + // cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index], + new_pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr()); + + computeUnit->schedule(mem_req_event, curTick() + + computeUnit->req_tick_latency); + + return true; +} + +const char* +ComputeUnit::DataPort::MemReqEvent::description() const +{ + return "ComputeUnit memory request event"; +} + +void +ComputeUnit::DataPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit; + + if (!(dataPort->sendTimingReq(pkt))) { + dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst)); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } +} + +/* + * The initial translation request could have been rejected, + * if queue is not Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. 
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+    // Snapshot the queue length: at most this many resends are attempted.
+    int len = retries.size();
+
+    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+    // recvReqRetry is an indication that the resource on which this
+    // port was stalling on is freed. So, remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        // The format string starts with "CU%d", so the CU id has to be
+        // passed ahead of the address.
+        DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            // Rejected again: re-stall and leave the packet at the head
+            // of the queue for the next retry.
+            stallPort();
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+/*
+ * Handle a translation response on the instruction TLB port. The TLB
+ * translation state is popped off the packet and freed, the original
+ * sender state is used to find the wavefront, and on a valid translation
+ * the packet is turned back into a read request and handed to the fetch
+ * stage. Always returns true.
+ */
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+    Addr line M5_VAR_USED = pkt->req->getPaddr();
+    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+            computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+    assert(pkt->senderState);
+
+    // pop off the TLB translation state
+    TheISA::GpuTLB::TranslationState *translation_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // Read the result before the TLB entry is freed.
+    bool success = translation_state->tlbEntry->valid;
+    delete translation_state->tlbEntry;
+    assert(!translation_state->ports.size());
+    pkt->senderState = translation_state->saved;
+    delete translation_state;
+
+    // use the original sender state to know how to close this transaction
+    ITLBPort::SenderState *sender_state =
+        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+    // get the wavefront associated with this translation request
+    Wavefront *wavefront = sender_state->wavefront;
+    delete pkt->senderState;
+
+    if (success) {
+        // pkt is reused in fetch(), don't delete it here.  However, we must
+        // reset the command to be a request so that it can be sent through
+        // the cu's master port
+        assert(pkt->cmd == MemCmd::ReadResp);
+        pkt->cmd = MemCmd::ReadReq;
+
+        computeUnit->fetchStage.fetch(pkt, wavefront);
+    } else {
+        // Translation was not valid: clear the wavefront's fetch
+        // bookkeeping instead of issuing the fetch.
+        if (wavefront->dropFetch) {
+            assert(wavefront->instructionBuffer.empty());
+            wavefront->dropFetch = false;
+        }
+
+        wavefront->pendingFetch = 0;
+    }
+
+    return true;
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::ITLBPort::recvReqRetry()
+{
+    // Snapshot the queue length: at most this many resends are attempted.
+    int len = retries.size();
+    // "CU%d" needs the CU id in front of the pending-request count.
+    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+
+    // recvReqRetry is an indication that the resource on which this
+    // port was stalling on is freed. So, remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            // Rejected again: re-stall and keep the packet queued.
+            stallPort();
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+/*
+ * Give every statistic of this compute unit its name and description,
+ * define the derived formulas (tlbLatency, ipc, vpc, numALUInstsExecuted)
+ * and forward the call to the pipeline stages and memory pipelines.
+ */
+void
+ComputeUnit::regStats()
+{
+    // Number of buckets in hitsPerTLBLevel: bucket 0 is the page table,
+    // bucket x is the Lx TLB.
+    const int num_tlb_levels = 4;
+
+    tlbCycles
+        .name(name() + ".tlb_cycles")
+        .desc("total number of cycles for all uncoalesced requests")
+        ;
+
+    tlbRequests
+        .name(name() + ".tlb_requests")
+        .desc("number of uncoalesced requests")
+        ;
+
+    tlbLatency
+        .name(name() + ".avg_translation_latency")
+        .desc("Avg. translation latency for data translations")
+        ;
+
+    tlbLatency = tlbCycles / tlbRequests;
+
+    hitsPerTLBLevel
+        .init(num_tlb_levels)
+        .name(name() + ".TLB_hits_distribution")
+        .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
+        ;
+
+    // fixed number of TLB levels
+    for (int i = 0; i < num_tlb_levels; ++i) {
+        if (!i)
+            hitsPerTLBLevel.subname(i,"page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
+    }
+
+    execRateDist
+        .init(0, 10, 2)
+        .name(name() + ".inst_exec_rate")
+        .desc("Instruction Execution Rate: Number of executed vector "
+              "instructions per cycle")
+        ;
+
+    ldsBankConflictDist
+        .init(0, VSZ, 2)
+        .name(name() + ".lds_bank_conflicts")
+        .desc("Number of bank conflicts per LDS memory packet")
+        ;
+
+    ldsBankAccesses
+        .name(name() + ".lds_bank_access_cnt")
+        .desc("Total number of LDS bank accesses")
+        ;
+
+    pageDivergenceDist
+        // A wavefront can touch 1 to VSZ pages per memory instruction.
+        // The number of pages per bin can be configured (here it's 4).
+        .init(1, VSZ, 4)
+        .name(name() + ".page_divergence_dist")
+        .desc("pages touched per wf (over all mem. instr.)")
+        ;
+
+    controlFlowDivergenceDist
+        .init(1, VSZ, 4)
+        .name(name() + ".warp_execution_dist")
+        .desc("number of lanes active per instruction (oval all instructions)")
+        ;
+
+    activeLanesPerGMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".gmem_lanes_execution_dist")
+        .desc("number of active lanes per global memory instruction")
+        ;
+
+    activeLanesPerLMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".lmem_lanes_execution_dist")
+        .desc("number of active lanes per local memory instruction")
+        ;
+
+    numInstrExecuted
+        .name(name() + ".num_instr_executed")
+        .desc("number of instructions executed")
+        ;
+
+    numVecOpsExecuted
+        .name(name() + ".num_vec_ops_executed")
+        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        ;
+
+    totalCycles
+        .name(name() + ".num_total_cycles")
+        .desc("number of cycles the CU ran for")
+        ;
+
+    ipc
+        .name(name() + ".ipc")
+        .desc("Instructions per cycle (this CU only)")
+        ;
+
+    vpc
+        .name(name() + ".vpc")
+        .desc("Vector Operations per cycle (this CU only)")
+        ;
+
+    numALUInstsExecuted
+        .name(name() + ".num_alu_insts_executed")
+        .desc("Number of dynamic non-GM memory insts executed")
+        ;
+
+    wgBlockedDueLdsAllocation
+        .name(name() + ".wg_blocked_due_lds_alloc")
+        .desc("Workgroup blocked due to LDS capacity")
+        ;
+
+    ipc = numInstrExecuted / totalCycles;
+    vpc = numVecOpsExecuted / totalCycles;
+
+    numTimesWgBlockedDueVgprAlloc
+        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
+        .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
+        ;
+
+    dynamicGMemInstrCnt
+        .name(name() + ".global_mem_instr_cnt")
+        .desc("dynamic global memory instructions count")
+        ;
+
+    dynamicLMemInstrCnt
+        .name(name() + ".local_mem_instr_cnt")
+        .desc("dynamic local memory intruction count")
+        ;
+
+    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+        dynamicLMemInstrCnt;
+
+    completedWfs
+        .name(name() + ".num_completed_wfs")
+        .desc("number of completed wavefronts")
+        ;
+
+    numCASOps
+        .name(name() + ".num_CAS_ops")
+        .desc("number of compare and swap operations")
+        ;
+
+    numFailedCASOps
+        .name(name() + ".num_failed_CAS_ops")
+        .desc("number of compare and swap operations that failed")
+        ;
+
+    // register stats of pipeline stages
+    fetchStage.regStats();
+    scoreboardCheckStage.regStats();
+    scheduleStage.regStats();
+    execStage.regStats();
+
+    // register stats of memory pipeline
+    globalMemoryPipe.regStats();
+    localMemoryPipe.regStats();
+}
+
+/*
+ * Count one more access to the virtual page that contains addr.
+ */
+void
+ComputeUnit::updatePageDivergenceDist(Addr addr)
+{
+    Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
+
+    // operator[] value-initializes a missing entry, so the first access
+    // to a page leaves its count at 1 (assumes an arithmetic mapped type,
+    // as the former "= 1" / "++" code did).
+    ++pagesTouched[virt_page_addr];
+}
+
+/*
+ * Exit callback: when page counting is enabled, write one CSV row per
+ * entry of pageAccesses to an output stream named after the compute unit.
+ */
+void
+ComputeUnit::CUExitCallback::process()
+{
+    if (computeUnit->countPages) {
+        std::ostream *page_stat_file =
+            simout.create(computeUnit->name().c_str());
+
+        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
+            std::endl;
+
+        // Iterate by const reference; the entries are only read.
+        for (const auto &iter : computeUnit->pageAccesses) {
+            *page_stat_file << std::hex << iter.first << ",";
+            *page_stat_file << std::dec << iter.second.first << ",";
+            *page_stat_file << std::dec << iter.second.second << std::endl;
+        }
+    }
+}
+
+/*
+ * Return true only if every SIMD reports done (isSimdDone) and all the
+ * memory-pipeline FIFOs and VRF<->memory buses checked below are ready.
+ */
+bool
+ComputeUnit::isDone() const
+{
+    for (int i = 0; i < numSIMDs; ++i) {
+        if (!isSimdDone(i)) {
+            return false;
+        }
+    }
+
+    // AND together the ready state of every VRF->memory pipe bus.
+    bool glbMemBusRdy = true;
+    for (int j = 0; j < numGlbMemUnits; ++j) {
+        glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
+    }
+    bool locMemBusRdy = true;
+    for (int j = 0; j < numLocMemUnits; ++j) {
+        locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
+    }
+
+    if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
+        !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
+        !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
+        || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
+        !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Forward a reference-counter query for (dispatchId, wgId) to the LDS.
+ */
+int32_t
+ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+{
+    return lds.getRefCounter(dispatchId, wgId);
+}
+
+/*
+ * Return true if the given SIMD has nothing left to do: all VRF->memory
+ * pipe buses and the SIMD's ALU pipe are ready, and every wavefront slot
+ * of the SIMD is in the S_STOPPED state.
+ */
+bool
+ComputeUnit::isSimdDone(uint32_t simdId) const
+{
+    assert(simdId < numSIMDs);
+
+    for (int i=0; i < numGlbMemUnits; ++i) {
+        if (!vrfToGlobalMemPipeBus[i].rdy())
+            return false;
+    }
+    for (int i=0; i < numLocMemUnits; ++i) {
+        if (!vrfToLocalMemPipeBus[i].rdy())
+            return false;
+    }
+    if (!aluPipe[simdId].rdy()) {
+        return false;
+    }
+
+    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
+        if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * send a general request to the LDS
+ * make sure to look at the return value here as your request might be
+ * NACK'd and returning
false means that you have to have some backup plan
+ */
+bool
+ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
+{
+    // this is just a request to carry the GPUDynInstPtr
+    // back and forth
+    Request *newRequest = new Request();
+    newRequest->setPaddr(0x0);
+
+    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
+    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
+
+    // This is the SenderState needed upon return
+    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
+
+    return ldsPort->sendTimingReq(newPacket);
+}
+
+/**
+ * get the result of packets sent to the LDS when they return
+ */
+bool
+ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
+{
+    const ComputeUnit::LDSPort::SenderState *senderState =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(packet->senderState);
+
+    fatal_if(!senderState, "did not get the right sort of sender state");
+
+    // Copy the instruction pointer out before the sender state is freed.
+    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+    delete packet->senderState;
+    delete packet->req;
+    delete packet;
+
+    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
+    return true;
+}
+
+/**
+ * attempt to send this packet, either the port is already stalled, the request
+ * is nack'd and must stall or the request goes through
+ * when a request cannot be sent, add it to the retries queue
+ */
+bool
+ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
+{
+    ComputeUnit::LDSPort::SenderState *sender_state =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
+    fatal_if(!sender_state, "packet without a valid sender state");
+
+    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
+
+    if (isStalled()) {
+        // Already stalled: queue behind the packets that are waiting.
+        fatal_if(retries.empty(), "must have retries waiting to be stalled");
+
+        retries.push(pkt);
+
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId);
+        return false;
+    } else if (!MasterPort::sendTimingReq(pkt)) {
+        // need to stall the LDS port until a recvReqRetry() is received
+        // this indicates that there is more space
+        stallPort();
+        retries.push(pkt);
+
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, pkt->req->getPaddr());
+        return false;
+    } else {
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, pkt->req->getPaddr());
+        return true;
+    }
+}
+
+/**
+ * the bus is telling the port that there is now space so retrying stalled
+ * requests should work now
+ * this allows the port to have a request be nack'd and then have the receiver
+ * say when there is space, rather than simply retrying the send every cycle
+ */
+void
+ComputeUnit::LDSPort::recvReqRetry()
+{
+    auto queueSize = retries.size();
+
+    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, queueSize);
+
+    fatal_if(queueSize < 1,
+             "why was there a recvReqRetry() with no pending reqs?");
+    fatal_if(!isStalled(),
+             "recvReqRetry() happened when the port was not stalled");
+
+    unstallPort();
+
+    // Resend queued packets in order until the queue drains or a send is
+    // rejected again.
+    while (!retries.empty()) {
+        PacketPtr packet = retries.front();
+
+        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
+
+        if (!MasterPort::sendTimingReq(packet)) {
+            // Stall port
+            stallPort();
+            DPRINTF(GPUPort, ": LDS send failed again\n");
+            break;
+        } else {
+            // Same debug flag as every other LDS port message, so the
+            // success line shows up together with the retry line.
+            DPRINTF(GPUPort, ": LDS send successful\n");
+            retries.pop();
+        }
+    }
+}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#ifndef __COMPUTE_UNIT_HH__
+#define __COMPUTE_UNIT_HH__
+
+// NOTE(review): the header names of the four includes below, and the
+// angle-bracketed template argument lists of many declarations in this file,
+// are missing in this listing -- presumably lost during extraction. Restore
+// them from the upstream file before building.
+#include
+#include
+#include
+#include
+
+#include "base/callback.hh"
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "enums/PrefetchType.hh"
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/fetch_stage.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/local_memory_pipeline.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/schedule_stage.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+
+static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
+static const int MAX_WIDTH_FOR_MEM_INST = 32;
+
+class NDRange;
+class Shader;
+class VectorRegisterFile;
+
+struct ComputeUnitParams;
+
+// Wavefront execution policy (type of ComputeUnit::exec_policy).
+// RR is presumably "round robin" -- TODO confirm against the scheduler.
+enum EXEC_POLICY
+{
+    OLDEST = 0,
+    RR
+};
+
+// List of execution units
+// (SIMD0..SIMD3 are the vector ALUs tested by ComputeUnit::isVecAlu(); the
+// two *_PIPE values are returned by GlbMemUnitId() and ShrMemUnitId())
+enum EXEC_UNIT
+{
+    SIMD0 = 0,
+    SIMD1,
+    SIMD2,
+    SIMD3,
+    GLBMEM_PIPE,
+    LDSMEM_PIPE,
+    NUM_UNITS
+};
+
+// Combinations of TLB and cache hit/miss outcomes
+enum TLB_CACHE
+{
+    TLB_MISS_CACHE_MISS = 0,
+    TLB_MISS_CACHE_HIT,
+    TLB_HIT_CACHE_MISS,
+    TLB_HIT_CACHE_HIT
+};
+
+// A GPU compute unit: owns the pipeline stage objects, the per-SIMD state
+// and the master ports (data, TLB, SQC, LDS) declared below.
+class ComputeUnit : public MemObject
+{
+  public:
+    FetchStage fetchStage;
+    ScoreboardCheckStage scoreboardCheckStage;
+    ScheduleStage scheduleStage;
+    ExecStage execStage;
+    GlobalMemPipeline globalMemoryPipe;
+    LocalMemPipeline localMemoryPipe;
+
+    // Buffers used to communicate between various pipeline stages
+
+    // List of waves which are ready to be scheduled.
+    // Each execution resource has a ready list. readyList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: make enum to index readyList
+    std::vector> readyList;
+
+    // Stores the status of waves. A READY implies the
+    // wave is ready to be scheduled this cycle and
+    // is already present in the readyList. waveStatusList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector>> waveStatusList;
+
+    // List of waves which will be dispatched to
+    // each execution resource. A FILLED implies
+    // dispatch list is non-empty and
+    // execution unit has something to execute
+    // this cycle. Currently, the dispatch list of
+    // an execution resource can hold only one wave because
+    // an execution resource can execute only one wave in a cycle.
+    // dispatchList is used to communicate between schedule
+    // and exec stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector> dispatchList;
+
+    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
+    int rrNextALUWp;
+    typedef ComputeUnitParams Params;
+    std::vector> wfList;
+    int cu_id;
+
+    // array of vector register files, one per SIMD
+    std::vector vrf;
+    // Number of vector ALU units (SIMDs) in CU
+    int numSIMDs;
+    // number of pipe stages for bypassing data to next dependent single
+    // precision vector instruction inside the vector ALU pipeline
+    int spBypassPipeLength;
+    // number of pipe stages for bypassing data to next dependent double
+    // precision vector instruction inside the vector ALU pipeline
+    int dpBypassPipeLength;
+    // number of cycles per issue period
+    int issuePeriod;
+
+    // Number of global and local memory execution resources in CU
+    int numGlbMemUnits;
+    int numLocMemUnits;
+    // tracks the last cycle a vector instruction was executed on a SIMD
+    std::vector lastExecCycle;
+
+    // true if we allow a separate TLB per lane
+    bool perLaneTLB;
+    // if 0, TLB prefetching is off.
+    int prefetchDepth;
+    // if fixed-stride prefetching, this is the stride.
+    int prefetchStride;
+
+    // Fixed-size array of VSZ addresses, all zeroed on construction
+    class LastVaddrWave
+    {
+      public:
+        Addr vaddrs[VSZ];
+        Addr& operator[](int idx) {
+            return vaddrs[idx];
+        }
+
+        LastVaddrWave() {
+            for (int i = 0; i < VSZ; ++i)
+                vaddrs[i] = 0;
+        }
+    };
+
+    LastVaddrWave lastVaddrCU;
+    std::vector lastVaddrPhase;
+    std::vector>> lastVaddrWF;
+    Enums::PrefetchType prefetchType;
+    EXEC_POLICY exec_policy;
+
+    bool xact_cas_mode;
+    bool debugSegFault;
+    bool functionalTLB;
+    bool localMemBarrier;
+
+    /*
+     * for Counting page accesses
+     *
+     * cuExitCallback inherits from Callback. When you register a callback
+     * function as an exit callback, it will get added to an exit callback
+     * queue, such that on simulation exit, all callbacks in the callback
+     * queue will have their process() function called.
+     */
+    bool countPages;
+
+    Shader *shader;
+    uint32_t barrier_id;
+    // vector of Vector ALU (MACC) pipelines
+    std::vector aluPipe;
+    // minimum issue period per SIMD unit (in cycles)
+    std::vector wfWait;
+
+    // Resource control for Vector Register File->Global Memory pipe buses
+    std::vector vrfToGlobalMemPipeBus;
+    // Resource control for Vector Register File->Local Memory pipe buses
+    std::vector vrfToLocalMemPipeBus;
+    int nextGlbMemBus;
+    int nextLocMemBus;
+    // Resource control for global memory to VRF data/address bus
+    WaitClass glbMemToVrfBus;
+    // Resource control for local memory to VRF data/address bus
+    WaitClass locMemToVrfBus;
+
+    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
+    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
+
+    Tick req_tick_latency;
+    Tick resp_tick_latency;
+
+    // number of vector registers being reserved for each SIMD unit
+    std::vector vectorRegsReserved;
+    // number of vector registers per SIMD unit
+    uint32_t numVecRegsPerSimd;
+    // Support for scheduling VGPR status update events
+    // (three parallel vectors: entry i of each describes one pending event)
+    std::vector > regIdxVec;
+    std::vector timestampVec;
+    std::vector statusVec;
+
+    // Queue a status update (newStatus at time 'when') for register regIdx
+    // of SIMD simdId. Operands larger than 4 (presumably bytes -- TODO
+    // confirm) also queue the same update for the next register index,
+    // wrapping at numVecRegsPerSimd.
+    void
+    registerEvent(uint32_t simdId,
+                  uint32_t regIdx,
+                  uint32_t operandSize,
+                  uint64_t when,
+                  uint8_t newStatus) {
+        regIdxVec.push_back(std::make_pair(simdId, regIdx));
+        timestampVec.push_back(when);
+        statusVec.push_back(newStatus);
+        if (operandSize > 4) {
+            regIdxVec.push_back(std::make_pair(simdId,
+                                               ((regIdx + 1) %
+                                                numVecRegsPerSimd)));
+            timestampVec.push_back(when);
+            statusVec.push_back(newStatus);
+        }
+    }
+
+    void updateEvents();
+
+    // this hash map will keep track of page divergence
+    // per memory instruction per wavefront. The hash map
+    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
+    std::map pagesTouched;
+
+    ComputeUnit(const Params *p);
+    ~ComputeUnit();
+    int spBypassLength() { return spBypassPipeLength; };
+    int dpBypassLength() { return dpBypassPipeLength; };
+    int storeBusLength() { return numCyclesPerStoreTransfer; };
+    int loadBusLength() { return numCyclesPerLoadTransfer; };
+    int wfSize() const { return wavefrontSize; };
+
+    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    void exec();
+    void initiateFetch(Wavefront *wavefront);
+    void fetch(PacketPtr pkt, Wavefront *wavefront);
+    void FillKernelState(Wavefront *w, NDRange *ndr);
+
+    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+                 int trueWgSizeTotal);
+
+    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+                             int trueWgSize[], int trueWgSizeTotal,
+                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+
+    void StartWorkgroup(NDRange *ndr);
+    int ReadyWorkgroup(NDRange *ndr);
+
+    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
+    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
+    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
+    int GlbMemUnitId() { return GLBMEM_PIPE; }
+    int ShrMemUnitId() { return LDSMEM_PIPE; }
+    // NOTE(review): the two counters below are only ever incremented here;
+    // they are plain ints, so a very long run could overflow them -- consider
+    // storing the wrapped value instead.
+    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
+    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+    /* This function cycles through all the wavefronts in all the phases to see
+     * if all of the wavefronts which should be associated with one barrier
+     * (denoted with _barrier_id), are all at the same barrier in the program
+     * (denoted by bcnt). When the number at the barrier matches bslots, then
+     * return true. (NOTE(review): the declared return type is int.)
+     */
+    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+    bool cedeSIMD(int simdId, int wfSlotId);
+
+    template void doSmReturn(GPUDynInstPtr gpuDynInst);
+    virtual void init();
+    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+                              bool kernelLaunch=true,
+                              RequestPtr req=nullptr);
+    void handleMemPacket(PacketPtr pkt, int memport_index);
+    bool processTimingPacket(PacketPtr pkt);
+    void processFetchReturn(PacketPtr pkt);
+    void updatePageDivergenceDist(Addr addr);
+
+    MasterID masterId() { return _masterId; }
+
+    bool isDone() const;
+    bool isSimdDone(uint32_t) const;
+
+  protected:
+    MasterID _masterId;
+
+    LdsState &lds;
+
+  public:
+    // the following stats compute the avg. TLB access latency per
+    // uncoalesced request (only for data)
+    Stats::Scalar tlbRequests;
+    Stats::Scalar tlbCycles;
+    Stats::Formula tlbLatency;
+    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
+    Stats::Vector hitsPerTLBLevel;
+
+    Stats::Scalar ldsBankAccesses;
+    Stats::Distribution ldsBankConflictDist;
+
+    // over all memory instructions executed over all wavefronts
+    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+    Stats::Distribution pageDivergenceDist;
+    Stats::Scalar dynamicGMemInstrCnt;
+    Stats::Scalar dynamicLMemInstrCnt;
+
+    Stats::Scalar wgBlockedDueLdsAllocation;
+    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
+    // when the instruction is committed, this number is still incremented by 1
+    Stats::Scalar numInstrExecuted;
+    // Number of cycles among successive instruction executions across all
+    // wavefronts of the same CU
+    Stats::Distribution execRateDist;
+    // number of individual vector operations executed
+    Stats::Scalar numVecOpsExecuted;
+    // Total cycles that something is running on the GPU
+    Stats::Scalar totalCycles;
+    Stats::Formula vpc; // vector ops per cycle
+    Stats::Formula ipc; // vector instructions per cycle
+    Stats::Distribution controlFlowDivergenceDist;
+    Stats::Distribution activeLanesPerGMemInstrDist;
+    Stats::Distribution activeLanesPerLMemInstrDist;
+    // number of vector ALU instructions received
+    Stats::Formula numALUInstsExecuted;
+    // number of times a WG can not start due to lack of free VGPRs in SIMDs
+    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+    Stats::Scalar numCASOps;
+    Stats::Scalar numFailedCASOps;
+    Stats::Scalar completedWfs;
+    // flag per vector SIMD unit that is set when there is at least one
+    // WV that has a vector ALU instruction as the oldest in its
+    // Instruction Buffer: Defined in the Scoreboard stage, consumed
+    // by the Execute stage.
+    std::vector vectorAluInstAvail;
+    // number of available (oldest) LDS instructions that could have
+    // been issued to the LDS at a specific issue slot
+    int shrMemInstAvail;
+    // number of available Global memory instructions that could have
+    // been issued to TCP at a specific issue slot
+    int glbMemInstAvail;
+
+    void
+    regStats();
+
+    LdsState &
+    getLds() const
+    {
+        return lds;
+    }
+
+    int32_t
+    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
+
+    bool
+    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
+
+    typedef std::unordered_map> pageDataStruct;
+    pageDataStruct pageAccesses;
+
+    // Callback object holding a back-pointer to its compute unit; see the
+    // "for Counting page accesses" comment above for how it is used.
+    class CUExitCallback : public Callback
+    {
+      private:
+        ComputeUnit *computeUnit;
+
+      public:
+        virtual ~CUExitCallback() { }
+
+        CUExitCallback(ComputeUnit *_cu)
+        {
+            computeUnit = _cu;
+        }
+
+        virtual void
+        process();
+    };
+
+    CUExitCallback *cuExitCallback;
+
+    /** Data access Port **/
+    class DataPort : public MasterPort
+    {
+      public:
+        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        // NOTE(review): not initialized by the constructor above
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            GPUDynInstPtr _gpuDynInst;
+            int port_index;
+            Packet::SenderState *saved;
+
+            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
+                        Packet::SenderState *sender_state=nullptr)
+                : _gpuDynInst(gpuDynInst),
+                  port_index(_port_index),
+                  saved(sender_state) { }
+        };
+
+        // Self-deleting (AutoDelete) event carrying a packet for this port
+        class MemReqEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        // Self-deleting (AutoDelete) event carrying a packet for this port
+        class MemRespEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        std::deque> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+
+    };
+
+    // Instruction cache access port
+    class SQCPort : public MasterPort
+    {
+      public:
+        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        // NOTE(review): not initialized by the constructor above
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            Wavefront *wavefront;
+            Packet::SenderState *saved;
+
+            SenderState(Wavefront *_wavefront, Packet::SenderState
+                    *sender_state=nullptr)
+                : wavefront(_wavefront), saved(sender_state) { }
+        };
+
+        std::deque> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+    };
+
+    /** Data TLB port **/
+    class DTLBPort : public MasterPort
+    {
+      public:
+        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index), stalled(false)
+        { }
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // the memInst that this is associated with
+            GPUDynInstPtr _gpuDynInst;
+
+            // the lane in the memInst this is associated with, so we send
+            // the memory request down the right port
+            int portIndex;
+
+            // constructor used for packets involved in timing accesses
+            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
+                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
+
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    // Instruction TLB port (see sqcTLBPort below)
+    class ITLBPort : public MasterPort
+    {
+      public:
+        ITLBPort(const std::string &_name, ComputeUnit *_cu)
+            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
+
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // The wavefront associated with this request
+            Wavefront *wavefront;
+
+            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    /**
+     * the port intended to communicate between the CU and its LDS
+     */
+    class LDSPort : public MasterPort
+    {
+      public:
+        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
+        : MasterPort(_name, _cu, _id), computeUnit(_cu)
+        {
+        }
+
+        bool isStalled() const { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the requests that were
+         * not successfully sent.
+         */
+        std::queue retries;
+
+        /**
+         *  SenderState is information carried along with the packet, esp. the
+         *  GPUDynInstPtr
+         */
+        class SenderState: public Packet::SenderState
+        {
+          protected:
+            // The actual read/write/atomic request that goes with this command
+            GPUDynInstPtr _gpuDynInst = nullptr;
+
+          public:
+            SenderState(GPUDynInstPtr gpuDynInst):
+                _gpuDynInst(gpuDynInst)
+            {
+            }
+
+            GPUDynInstPtr
+            getMemInst() const
+            {
+                return _gpuDynInst;
+            }
+        };
+
+        virtual bool
+        sendTimingReq(PacketPtr pkt);
+
+      protected:
+
+        bool stalled = false; ///< whether or not it is stalled
+
+        ComputeUnit *computeUnit;
+
+        virtual bool
+        recvTimingResp(PacketPtr pkt);
+
+        virtual Tick
+        recvAtomic(PacketPtr pkt) { return 0; }
+
+        virtual void
+        recvFunctional(PacketPtr pkt)
+        {
+        }
+
+        virtual void
+        recvRangeChange()
+        {
+        }
+
+        virtual void
+        recvReqRetry();
+    };
+
+    /** The port to access the Local Data Store
+     *  Can be connected to a LDS object
+     */
+    LDSPort *ldsPort = nullptr;
+
+    LDSPort *
+    getLdsPort() const
+    {
+        return ldsPort;
+    }
+
+    /** The memory port for SIMD data accesses.
+     *  Can be connected to PhysMem for Ruby for timing simulations
+     */
+    std::vector memPort;
+    // port to the TLB hierarchy (i.e., the L1 TLB)
+    std::vector tlbPort;
+    // port to the SQC (i.e. the I-cache)
+    // NOTE(review): unlike ldsPort, no in-class initializer is visible here
+    SQCPort *sqcPort;
+    // port to the SQC TLB (there's a separate TLB for each I-cache)
+    ITLBPort *sqcTLBPort;
+
+    // Create the port named if_name on demand and return it; unknown names
+    // panic and a second "ldsPort" request is fatal.
+    // NOTE(review): idx indexes memPort/tlbPort with no bounds check visible
+    // here -- presumably both vectors are sized elsewhere; confirm.
+    virtual BaseMasterPort&
+    getMasterPort(const std::string &if_name, PortID idx)
+    {
+        if (if_name == "memory_port") {
+            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *memPort[idx];
+        } else if (if_name == "translation_port") {
+            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *tlbPort[idx];
+        } else if (if_name == "sqc_port") {
+            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
+                                  this, idx);
+            return *sqcPort;
+        } else if (if_name == "sqc_tlb_port") {
+            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
+            return *sqcTLBPort;
+        } else if (if_name == "ldsPort") {
+            if (ldsPort) {
+                fatal("an LDS port was already allocated");
+            }
+            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
+            return *ldsPort;
+        } else {
+            panic("incorrect port name");
+        }
+    }
+
+    // xact_cas_load()
+    // (SIMD id, wavefront slot id) pair identifying one wave
+    class waveIdentifier
+    {
+      public:
+        waveIdentifier() { }
+        waveIdentifier(int _simdId, int _wfSlotId)
+          : simdId(_simdId), wfSlotId(_wfSlotId) { }
+
+        int simdId;
+        int wfSlotId;
+    };
+
+    class waveQueue
+    {
+      public:
+        std::list waveIDQueue;
+    };
+    std::map xactCasLoadMap;
+
+    // returns the current sequence number, then advances it
+    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+
+  private:
+    uint64_t globalSeqNum;
+    int wavefrontSize;
+};
+
+#endif // __COMPUTE_UNIT_HH__
diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc
new file mode 100644
index 000000000..f3f2d2927
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/condition_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+// Start out detached from any compute unit and with no registers; the
+// register and busy vectors are default-constructed, i.e. already empty.
+ConditionRegisterState::ConditionRegisterState()
+    : computeUnit(nullptr)
+{
+}
+
+// Remember the owning compute unit and derive this object's name from it.
+void
+ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+    _name = _computeUnit->name() + ".CondRegState";
+    computeUnit = _computeUnit;
+}
+
+// Size the state for _size condition registers, none of them busy.
+void
+ConditionRegisterState::init(uint32_t _size)
+{
+    busy.resize(_size, 0);
+    c_reg.resize(_size);
+}
+
+// For every condition-register destination operand of ii: mark the register
+// busy and register an event with the wavefront's compute unit that sets its
+// status back to 0 once the single-precision bypass length has elapsed.
+void
+ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
+{
+    for (auto opIdx = 0; opIdx < ii->getNumOperands(); ++opIdx) {
+        // only condition register destination operands are of interest
+        if (!ii->isCondRegister(opIdx) || !ii->isDstOperand(opIdx))
+            continue;
+
+        // mark the register as busy
+        markReg(ii->getRegisterIndex(opIdx), 1);
+
+        auto cu = w->computeUnit;
+        uint32_t pipeLen = cu->spBypassLength();
+        Shader *sh = cu->shader;
+
+        // schedule an event for marking the register as ready
+        cu->registerEvent(w->simdId, ii->getRegisterIndex(opIdx),
+                          ii->getOperandSize(opIdx),
+                          sh->tick_cnt + sh->ticks(pipeLen), 0);
+    }
+}
diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh
new file mode 100644
index 000000000..139874a66
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __CONDITION_REGISTER_STATE_HH__
+#define __CONDITION_REGISTER_STATE_HH__
+
+// NOTE(review): the header names of the two includes below, and the
+// angle-bracketed template parameter/argument lists further down, are
+// missing in this listing -- presumably lost during extraction. Restore them
+// from the upstream file before building.
+#include
+#include
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+class GPUStaticInst;
+class Shader;
+class Wavefront;
+
+// Condition Register State (used only when executing HSAIL)
+//
+// Holds one bit per (register, thread) pair plus a per-register busy status.
+class ConditionRegisterState
+{
+  public:
+    ConditionRegisterState();
+    // resize the state to _size registers, all of them not busy
+    void init(uint32_t _size);
+    const std::string name() const { return _name; }
+    // attach to the owning compute unit and derive the object name
+    void setParent(ComputeUnit *_computeUnit);
+    void regStats() { }
+
+    // Return the condition bit of thread threadId in register regIdx,
+    // converted to T (i.e. T(0) or T(1)).
+    template
+    T
+    read(int regIdx, int threadId)
+    {
+        bool tmp = c_reg[regIdx][threadId];
+
+        // Convert by value. Reading the bool through a T* (as the code did
+        // before) accesses sizeof(T) bytes of a single bool object, which
+        // is out of bounds whenever T is wider than bool.
+        return static_cast<T>(tmp);
+    }
+
+    // Store the least significant bit of value as the condition bit of
+    // thread threadId in register regIdx.
+    template
+    void
+    write(int regIdx, int threadId, T value)
+    {
+        c_reg[regIdx][threadId] = (bool)(value & 0x01);
+    }
+
+    // set the busy status of register regIdx (bounds-checked via at())
+    void
+    markReg(int regIdx, uint8_t value)
+    {
+        busy.at(regIdx) = value;
+    }
+
+    // return the busy status of register idx (bounds-checked via at())
+    uint8_t
+    regBusy(int idx)
+    {
+        uint8_t status = busy.at(idx);
+        return status;
+    }
+
+    int numRegs() { return c_reg.size(); }
+    // mark the condition-register destinations of ii busy and register the
+    // events that release them again
+    void exec(GPUStaticInst *ii, Wavefront *w);
+
+  private:
+    ComputeUnit* computeUnit;
+    std::string _name;
+    // Condition Register state
+    std::vector c_reg;
+    // flag indicating if a register is busy
+    std::vector busy;
+};
+
+#endif
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
new file mode 100644
index 000000000..55e4be72a
--- /dev/null
+++ b/src/gpu-compute/dispatcher.cc
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+
+#include "gpu-compute/dispatcher.hh"
+
+#include "cpu/base.hh"
+#include "debug/GPUDisp.hh"
+#include "gpu-compute/cl_driver.hh"
+#include "gpu-compute/cl_event.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet_access.hh"
+
+// NOTE(review): the template arguments of the pkt->get() and pkt->getPtr()
+// calls in this file are missing in this listing -- presumably lost during
+// extraction. Restore them from the upstream file before building.
+
+GpuDispatcher *GpuDispatcher::instance = nullptr;
+
+// Set up the PIO register window, hand this dispatcher to the shader and the
+// driver, schedule the first tick and create the translation port.
+GpuDispatcher::GpuDispatcher(const Params *p)
+    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
+      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
+      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
+      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
+{
+    shader->handshake(this);
+    driver->handshake(this);
+
+    ndRange.wg_disp_rem = false;
+    ndRange.globalWgId = 0;
+
+    schedule(&tickEvent, 0);
+
+    // translation port for the dispatcher; there is only one, so the name
+    // carries no index (the format string used to contain a %d that had no
+    // matching argument)
+    tlbPort = new TLBPort(csprintf("%s-port", name()), this);
+
+    num_kernelLaunched
+    .name(name() + ".num_kernel_launched")
+    .desc("number of kernel launched")
+    ;
+}
+
+// Create the dispatcher and publish it as the singleton instance.
+GpuDispatcher *GpuDispatcherParams::create()
+{
+    GpuDispatcher *dispatcher = new GpuDispatcher(this);
+    GpuDispatcher::setInstance(dispatcher);
+
+    return GpuDispatcher::getInstance();
+}
+
+// Checkpoint the tick at which tickEvent is scheduled (0 if it is not).
+// Fatal if workgroups of ndRange remain to be dispatched.
+void
+GpuDispatcher::serialize(CheckpointOut &cp) const
+{
+    Tick event_tick = 0;
+
+    if (ndRange.wg_disp_rem)
+        fatal("Checkpointing not supported during active workgroup execution");
+
+    if (tickEvent.scheduled())
+        event_tick = tickEvent.when();
+
+    SERIALIZE_SCALAR(event_tick);
+
+}
+
+// Restore tickEvent from a checkpoint: drop any pending schedule and
+// re-schedule at the saved tick if that tick is non-zero.
+void
+GpuDispatcher::unserialize(CheckpointIn &cp)
+{
+    Tick event_tick;
+
+    if (tickEvent.scheduled())
+        deschedule(&tickEvent);
+
+    UNSERIALIZE_SCALAR(event_tick);
+
+    if (event_tick)
+        schedule(&tickEvent, event_tick);
+}
+
+// Report the PIO register window [pioAddr, pioAddr + pioSize).
+AddrRangeList
+GpuDispatcher::getAddrRanges() const
+{
+    AddrRangeList ranges;
+
+    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
+            pioAddr, pioSize);
+
+    ranges.push_back(RangeSize(pioAddr, pioSize));
+
+    return ranges;
+}
+
+// PIO read. Offset 0 (8 bytes) returns dispatchActive; offsets from 8 on
+// read back the bytes of curTask. Returns the PIO latency.
+Tick
+GpuDispatcher::read(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr);
+    assert(pkt->getAddr() < pioAddr + pioSize);
+
+    int offset = pkt->getAddr() - pioAddr;
+    pkt->allocate();
+
+    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
+
+    if (offset < 8) {
+        assert(!offset);
+        assert(pkt->getSize() == 8);
+
+        uint64_t retval = dispatchActive;
+        pkt->set(retval);
+    } else {
+        offset -= 8;
+        // the access may end exactly at the end of the task struct
+        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
+        char *curTaskPtr = (char*)&curTask;
+
+        memcpy(pkt->getPtr(), curTaskPtr + offset, pkt->getSize());
+    }
+
+    pkt->makeAtomicResponse();
+
+    return pioDelay;
+}
+
+// PIO write. A write to offset 0 launches the kernel described by curTask;
+// any other offset fills in the bytes of curTask. Returns the PIO latency.
+Tick
+GpuDispatcher::write(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr);
+    assert(pkt->getAddr() < pioAddr + pioSize);
+
+    int offset = pkt->getAddr() - pioAddr;
+
+#if TRACING_ON
+    uint64_t data_val = 0;
+
+    switch (pkt->getSize()) {
+      case 1:
+        data_val = pkt->get();
+        break;
+      case 2:
+        data_val = pkt->get();
+        break;
+      case 4:
+        data_val = pkt->get();
+        break;
+      case 8:
+        data_val = pkt->get();
+        break;
+      default:
+        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
+    }
+
+    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
+            pkt->getSize());
+#endif
+    if (!offset) {
+        static int nextId = 0;
+
+        // The depends field of the qstruct, which was previously unused, is
+        // used to communicate with simulated application.
+        if (curTask.depends) {
+            HostState hs;
+            shader->ReadMem((uint64_t)(curTask.depends), &hs,
+                            sizeof(HostState), 0);
+
+            // update event start time (in nano-seconds)
+            uint64_t start = curTick() / 1000;
+
+            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
+                             &start, sizeof(uint64_t), 0);
+        }
+
+        // launch kernel
+        ++num_kernelLaunched;
+
+        NDRange *ndr = &(ndRangeMap[nextId]);
+        // copy dispatch info
+        ndr->q = curTask;
+
+        // update the numDispTask polled by the runtime
+        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
+
+        ndr->numWgTotal = 1;
+
+        for (int i = 0; i < 3; ++i) {
+            ndr->wgId[i] = 0;
+            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
+            ndr->numWgTotal *= ndr->numWg[i];
+        }
+
+        ndr->numWgCompleted = 0;
+        ndr->globalWgId = 0;
+        ndr->wg_disp_rem = true;
+        ndr->execDone = false;
+        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
+        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
+        ndr->dispatchId = nextId;
+        ndr->curTid = pkt->req->threadId();
+        DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
+        execIds.push(nextId);
+        ++nextId;
+
+        dispatchActive = true;
+
+        if (!tickEvent.scheduled()) {
+            schedule(&tickEvent, curTick() + shader->ticks(1));
+        }
+    } else {
+        // populate current task struct
+        // first 64 bits are launch reg
+        offset -= 8;
+        // reject writes into the launch register tail (negative offset) and
+        // writes whose last byte would land beyond the task struct
+        assert(offset >= 0);
+        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
+        char *curTaskPtr = (char*)&curTask;
+        memcpy(curTaskPtr + offset, pkt->getPtr(), pkt->getSize());
+    }
+
+    pkt->makeAtomicResponse();
+
+    return pioDelay;
+}
+
+
+// Return the translation port for "translation_port"; every other name is
+// resolved by DmaDevice.
+BaseMasterPort&
+GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
+{
+    if (if_name == "translation_port") {
+        return *tlbPort;
+    }
+
+    return DmaDevice::getMasterPort(if_name, idx);
+}
+
+// One dispatcher tick: try to dispatch the workgroups of every queued kernel,
+// re-queueing the kernels that do not fit, then wake the host if any kernel
+// finished and drain the list of finished kernels.
+void
+GpuDispatcher::exec()
+{
+    int fail_count = 0;
+
+    // There are potentially multiple outstanding kernel launches.
+    // It is possible that the workgroups in a different kernel
+    // can fit on the GPU even if another kernel's workgroups cannot
+    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
+
+    while (execIds.size() > fail_count) {
+        int execId = execIds.front();
+
+        while (ndRangeMap[execId].wg_disp_rem) {
+            //update the thread context
+            shader->updateThreadContext(ndRangeMap[execId].curTid);
+
+            // attempt to dispatch_workgroup
+            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
+                // if we failed try the next kernel,
+                // it may have smaller workgroups.
+                // put it on the queue to retry later
+                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
+                execIds.push(execId);
+                ++fail_count;
+                break;
+            }
+        }
+        // let's try the next kernel_id
+        execIds.pop();
+    }
+
+    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
+
+    // wakeup the CPU if any Kernels completed this cycle
+    if (doneIds.size() && cpu) {
+        shader->hostWakeUp(cpu);
+    }
+
+    while (doneIds.size()) {
+        // report and retire every kernel that completed this cycle
+        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
+        doneIds.pop();
+    }
+}
+
+// Called when a workgroup of the kernel that owns wavefront w completes.
+// When it was the kernel's last workgroup, mark the kernel done, notify the
+// host-side variables and record the event end time.
+void
+GpuDispatcher::notifyWgCompl(Wavefront *w)
+{
+    int kern_id = w->kern_id;
+    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
+    assert(ndRangeMap[kern_id].dispatchId == kern_id);
+    ndRangeMap[kern_id].numWgCompleted++;
+
+    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
+        ndRangeMap[kern_id].execDone = true;
+        doneIds.push(kern_id);
+
+        if (ndRangeMap[kern_id].addrToNotify) {
+            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
+                          0);
+        }
+
+        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
+
+        // update event end time (in nano-seconds)
+        if (ndRangeMap[kern_id].q.depends) {
+            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
+            uint64_t event;
+            shader->ReadMem((uint64_t)(&host_state->event), &event,
+                            sizeof(uint64_t), 0);
+
+            uint64_t end = curTick() / 1000;
+
+            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
+                             sizeof(uint64_t), 0);
+        }
+    }
+
+    if (!tickEvent.scheduled()) {
+        schedule(&tickEvent, curTick() + shader->ticks(1));
+    }
+}
+
+// Make sure a dispatcher tick is pending one shader cycle from now.
+void
+GpuDispatcher::scheduleDispatch()
+{
+    if (!tickEvent.scheduled())
+        schedule(&tickEvent, curTick() + shader->ticks(1));
+}
+
+// Update an int in simulated memory at addr: with off == 0 store val; with
+// off != 0 read the current value, add off and store the sum (val is then
+// only a scratch variable). Panics if no host CPU is attached.
+void
+GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
+{
+    if (cpu) {
+        if (off) {
+            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
+                              true);
+            val += off;
+        }
+
+        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
+    } else {
+        panic("Cannot find host");
+    }
+}
+
+GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
+    : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
+{
+}
+
+// run one dispatcher tick
+void
+GpuDispatcher::TickEvent::process()
+{
+    dispatcher->exec();
+}
+
+const char*
+GpuDispatcher::TickEvent::description() const
+{
+    return "GPU Dispatcher tick";
+}
+
+// helper functions for driver to retrieve GPU attributes
+int
+GpuDispatcher::getNumCUs()
+{
+    return shader->cuList.size();
+}
+
+void
+GpuDispatcher::setFuncargsSize(int funcargs_size)
+{
+    shader->funcargs_size = funcargs_size;
+}
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
new file mode 100644
index 000000000..76f932655
--- /dev/null
+++ b/src/gpu-compute/dispatcher.hh
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __GPU_DISPATCHER_HH__ +#define __GPU_DISPATCHER_HH__ + +#include +#include + +#include "base/statistics.hh" +#include "dev/dma_device.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/port.hh" +#include "params/GpuDispatcher.hh" + +class BaseCPU; +class Shader; + +class GpuDispatcher : public DmaDevice +{ + public: + typedef GpuDispatcherParams Params; + + class TickEvent : public Event + { + private: + GpuDispatcher *dispatcher; + + public: + TickEvent(GpuDispatcher *); + void process(); + const char *description() const; + }; + + MasterID masterId() { return _masterId; } + + protected: + MasterID _masterId; + + // Base and length of PIO register space + Addr pioAddr; + Addr pioSize; + Tick pioDelay; + + HsaQueueEntry curTask; + + std::unordered_map ndRangeMap; + NDRange ndRange; + + // list of kernel_ids to launch + std::queue execIds; + // list of kernel_ids that have finished + std::queue doneIds; + + uint64_t dispatchCount; + // is there a kernel in execution? + bool dispatchActive; + + BaseCPU *cpu; + Shader *shader; + ClDriver *driver; + TickEvent tickEvent; + + static GpuDispatcher *instance; + + // sycall emulation mode can have only 1 application running(?) + // else we have to do some pid based tagging + // unused + typedef std::unordered_map TranslationBuffer; + TranslationBuffer tlb; + + public: + /*statistics*/ + Stats::Scalar num_kernelLaunched; + GpuDispatcher(const Params *p); + + ~GpuDispatcher() { } + + void exec(); + virtual void serialize(CheckpointOut &cp) const; + virtual void unserialize(CheckpointIn &cp); + void notifyWgCompl(Wavefront *w); + void scheduleDispatch(); + void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); + + // using singleton so that glue code can pass pointer locations + // to the dispatcher. 
when there are multiple dispatchers, we can + // call something like getInstance(index) + static void + setInstance(GpuDispatcher *_instance) + { + instance = _instance; + } + + static GpuDispatcher* getInstance() { return instance; } + + class TLBPort : public MasterPort + { + public: + + TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) + : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } + + protected: + GpuDispatcher *dispatcher; + + virtual bool recvTimingResp(PacketPtr pkt) { return true; } + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry() { } + + }; + + TLBPort *tlbPort; + + virtual BaseMasterPort& getMasterPort(const std::string &if_name, + PortID idx); + + AddrRangeList getAddrRanges() const; + Tick read(PacketPtr pkt); + Tick write(PacketPtr pkt); + + // helper functions to retrieve/set GPU attributes + int getNumCUs(); + void setFuncargsSize(int funcargs_size); +}; + +#endif // __GPU_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc new file mode 100644 index 000000000..c2b95f85e --- /dev/null +++ b/src/gpu-compute/exec_stage.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/exec_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0) +{ + numTransActiveIdle = 0; + idle_dur = 0; +} + +void +ExecStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ExecStage"; + dispatchList = &computeUnit->dispatchList; + vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); + glbMemInstAvail= &(computeUnit->glbMemInstAvail); + shrMemInstAvail= &(computeUnit->shrMemInstAvail); + idle_dur = 0; +} + +void +ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { + if (stage == 
IdleExec) { + // count cycles of no vector ALU instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { + numCyclesWithNoInstrTypeIssued[unitId]++; + } + + // count cycles of no global memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*glbMemInstAvail)--; + } + + // count cycles of no shared memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*shrMemInstAvail)--; + } + } else if (stage == BusyExec) { + // count the number of cycles an instruction to a specific unit + // was issued + numCyclesWithInstrTypeIssued[unitId]++; + thisTimeInstExecuted = true; + instrExecuted = true; + ++executionResourcesUsed; + } else if (stage == PostExec) { + // count the number of transitions from active to idle + if (lastTimeInstExecuted && !thisTimeInstExecuted) { + ++numTransActiveIdle; + } + + if (!lastTimeInstExecuted && thisTimeInstExecuted) { + idleDur.sample(idle_dur); + idle_dur = 0; + } else if (!thisTimeInstExecuted) { + idle_dur++; + } + + lastTimeInstExecuted = thisTimeInstExecuted; + // track the number of cycles we either issued one vector instruction + // or issued no instructions at all + if (instrExecuted) { + numCyclesWithInstrIssued++; + } else { + numCyclesWithNoIssue++; + } + + spc.sample(executionResourcesUsed); + } +} + +void +ExecStage::initStatistics() +{ + instrExecuted = false; + executionResourcesUsed = 0; + thisTimeInstExecuted = false; +} + +void +ExecStage::exec() +{ + initStatistics(); + + for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { + // if dispatch list for this execution resource is empty, + // skip this 
execution resource this cycle + if (dispatchList->at(unitId).second == EMPTY) { + collectStatistics(IdleExec, unitId); + continue; + } + + collectStatistics(BusyExec, unitId); + // execute an instruction for the WF + dispatchList->at(unitId).first->exec(); + // clear the dispatch list entry + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first = (Wavefront*)nullptr; + } + + collectStatistics(PostExec, 0); +} + +void +ExecStage::regStats() +{ + numTransActiveIdle + .name(name() + ".num_transitions_active_to_idle") + .desc("number of CU transitions from active to idle") + ; + + numCyclesWithNoIssue + .name(name() + ".num_cycles_with_no_issue") + .desc("number of cycles the CU issues nothing") + ; + + numCyclesWithInstrIssued + .name(name() + ".num_cycles_with_instr_issued") + .desc("number of cycles the CU issued at least one instruction") + ; + + spc + .init(0, numSIMDs + numMemUnits, 1) + .name(name() + ".spc") + .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") + ; + + idleDur + .init(0,75,5) + .name(name() + ".idle_duration_in_cycles") + .desc("duration of idle periods in cycles") + ; + + numCyclesWithInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instrtype_issue") + .desc("Number of cycles at least one instruction of specific type " + "issued") + ; + + numCyclesWithNoInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instr_type_no_issue") + .desc("Number of cycles no instruction of specific type issued") + ; + + for (int i = 0; i < numSIMDs; ++i) { + numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + } + + numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); +} 
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh new file mode 100644 index 000000000..2de74366b --- /dev/null +++ b/src/gpu-compute/exec_stage.hh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __EXEC_STAGE_HH__ +#define __EXEC_STAGE_HH__ + +#include +#include +#include + +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; +struct ComputeUnitParams; + +enum STAT_STATUS +{ + IdleExec, + BusyExec, + PostExec +}; + +enum DISPATCH_STATUS +{ + EMPTY = 0, + FILLED +}; + +// Execution stage. +// Each execution resource executes the +// wave which is in its dispatch list. +// The schedule stage is responsible for +// adding a wave into each execution resource's +// dispatch list. + +class ExecStage +{ + public: + ExecStage(const ComputeUnitParams* params); + ~ExecStage() { } + void init(ComputeUnit *cu); + void exec(); + + std::string name() { return _name; } + void regStats(); + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // number of cycles (per execution unit) during which at least one + // instruction was issued to that unit + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles (per execution unit) during which the unit issued + // no instruction targeting that unit, even though there is at least one + // Wavefront with such an instruction as the oldest + Stats::Vector numCyclesWithNoInstrTypeIssued; + // SIMDs active per cycle + Stats::Distribution spc; + + private: + void collectStatistics(enum STAT_STATUS stage, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + + // Number of memory execution resources; + // both global and local memory execution resources in CU + uint32_t numMemUnits; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. 
+ // dispatchList is used to communicate between schedule + // and exec stage + std::vector> *dispatchList; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector *vectorAluInstAvail; + int *glbMemInstAvail; + int *shrMemInstAvail; + bool lastTimeInstExecuted; + bool thisTimeInstExecuted; + bool instrExecuted; + Stats::Scalar numTransActiveIdle; + Stats::Distribution idleDur; + uint32_t executionResourcesUsed; + uint64_t idle_dur; + std::string _name; +}; + +#endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc new file mode 100644 index 000000000..1f5e6ded3 --- /dev/null +++ b/src/gpu-compute/fetch_stage.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), + computeUnit(nullptr) +{ + for (int j = 0; j < numSIMDs; ++j) { + FetchUnit newFetchUnit(p); + fetchUnit.push_back(newFetchUnit); + } +} + +FetchStage::~FetchStage() +{ + fetchUnit.clear(); +} + +void +FetchStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".FetchStage"; + + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + fetchUnit[j].init(computeUnit); + } +} + +void +FetchStage::exec() +{ + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].exec(); + } +} + +void +FetchStage::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + const unsigned num_instructions = pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); + + instFetchInstReturned.sample(num_instructions); + uint32_t simdId = wavefront->simdId; + fetchUnit[simdId].processFetchReturn(pkt); +} + +void +FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + fetchUnit[wavefront->simdId].fetch(pkt, wavefront); +} + +void +FetchStage::regStats() +{ + instFetchInstReturned + .init(1, 32, 1) + .name(name() + 
".inst_fetch_instr_returned") + .desc("For each instruction fetch request recieved record how many " + "instructions you got from it") + ; +} diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh new file mode 100644 index 000000000..ce7faa8ac --- /dev/null +++ b/src/gpu-compute/fetch_stage.hh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#ifndef __FETCH_STAGE_HH__ +#define __FETCH_STAGE_HH__ + +#include +#include + +#include "gpu-compute/fetch_unit.hh" + +// Instruction fetch stage. +// All dispatched wavefronts for all SIMDS are analyzed for the +// need to fetch instructions. From the fetch eligible waves, +// one wave is selected from each SIMD and fetch is initiated +// for the selected waves. + +class ComputeUnit; +class Wavefront; + +class FetchStage +{ + public: + FetchStage(const ComputeUnitParams* params); + ~FetchStage(); + void init(ComputeUnit *cu); + void exec(); + void processFetchReturn(PacketPtr pkt); + void fetch(PacketPtr pkt, Wavefront *wave); + + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + Stats::Distribution instFetchInstReturned; + + private: + uint32_t numSIMDs; + ComputeUnit *computeUnit; + + // List of fetch units. A fetch unit is + // instantiated per SIMD + std::vector fetchUnit; + std::string _name; +}; + +#endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc new file mode 100644 index 000000000..1f0a7d78e --- /dev/null +++ b/src/gpu-compute/fetch_unit.cc @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_unit.hh" + +#include "debug/GPUFetch.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/ruby/system/RubySystem.hh" + +uint32_t FetchUnit::globalFetchUnitID; + +FetchUnit::FetchUnit(const ComputeUnitParams* params) : + timingSim(true), + computeUnit(nullptr), + fetchScheduler(params), + waveList(nullptr) +{ +} + +FetchUnit::~FetchUnit() +{ + fetchQueue.clear(); + fetchStatusQueue.clear(); +} + +void +FetchUnit::init(ComputeUnit *cu) +{ + computeUnit = cu; + timingSim = computeUnit->shader->timingSim; + fetchQueue.clear(); + fetchStatusQueue.resize(computeUnit->shader->n_wf); + + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + } + + fetchScheduler.bindList(&fetchQueue); +} + +void +FetchUnit::exec() +{ + // re-evaluate waves which are marked as not ready for fetch + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + // Following code assumes 64-bit opertaion and all insts are + // represented by 64-bit pointers to inst objects. 
+ Wavefront *curWave = fetchStatusQueue[j].first; + assert (curWave); + + // The wavefront has to be active, the IB occupancy has to be + // 4 or less instructions and it can not have any branches to + // prevent speculative instruction fetches + if (!fetchStatusQueue[j].second) { + if (curWave->status == Wavefront::S_RUNNING && + curWave->instructionBuffer.size() <= 4 && + !curWave->instructionBufferHasBranch() && + !curWave->pendingFetch) { + fetchQueue.push_back(curWave); + fetchStatusQueue[j].second = true; + } + } + } + + // Fetch only if there is some wave ready to be fetched + // An empty fetchQueue will cause the schedular to panic + if (fetchQueue.size()) { + Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); + waveToBeFetched->pendingFetch = true; + fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; + initiateFetch(waveToBeFetched); + } +} + +void +FetchUnit::initiateFetch(Wavefront *wavefront) +{ + // calculate the virtual address to fetch from the SQC + Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); + vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + + // Since this is an instruction prefetch, if you're split then just finish + // out the current line. 
+ unsigned block_size = RubySystem::getBlockSizeBytes(); + // check for split accesses + Addr split_addr = roundDown(vaddr + block_size - 1, block_size); + unsigned size = block_size; + + if (split_addr > vaddr) { + // misaligned access, just grab the rest of the line + size = split_addr - vaddr; + } + + // set up virtual request + Request *req = new Request(0, vaddr, size, Request::INST_FETCH, + computeUnit->masterId(), 0, 0, 0); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + // This fetchBlock is kind of faux right now - because the translations so + // far don't actually return Data + uint64_t fetchBlock; + pkt->dataStatic(&fetchBlock); + + if (timingSim) { + // SenderState needed on Return + pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); + + // Sender State needed by TLB hierarchy + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc, + false, pkt->senderState); + + if (computeUnit->sqcTLBPort->isStalled()) { + assert(computeUnit->sqcTLBPort->retries.size() > 0); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet is issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. 
+ computeUnit->sqcTLBPort->stallPort(); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); + } + } else { + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc); + + computeUnit->sqcTLBPort->sendFunctional(pkt); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state; + // fetch the instructions from the SQC when we operate in + // functional mode only + fetch(pkt, wavefront); + } +} + +void +FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + + // this is necessary because the GPU TLB receives packets instead of + // requests. when the translation is complete, all relevent fields in the + // request will be populated, but not in the packet. here we create the + // new packet so we can set the size, addr, and proper flags. + PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + + TheGpuISA::RawMachInst *data = + new TheGpuISA::RawMachInst[pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)]; + + pkt->dataDynamic(data); + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); + + if (timingSim) { + // translation is done. Send the appropriate timing memory request. 
+ + if (!computeUnit->sqcPort->sendTimingReq(pkt)) { + computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + wavefront)); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } + } else { + computeUnit->sqcPort->sendFunctional(pkt); + processFetchReturn(pkt); + } +} + +void +FetchUnit::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " + "%d bytes, %d instructions!\n", computeUnit->cu_id, + wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), + pkt->req->getSize(), pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)); + + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } else { + TheGpuISA::RawMachInst *inst_index_ptr = + (TheGpuISA::RawMachInst*)pkt->getPtr(); + + assert(wavefront->instructionBuffer.size() <= 4); + + for (int i = 0; i < pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); ++i) { + GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); + + assert(inst_ptr); + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", + computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, inst_ptr->disassemble()); + + GPUDynInstPtr gpuDynInst = + std::make_shared(computeUnit, wavefront, inst_ptr, + computeUnit->getAndIncSeqNum()); + + wavefront->instructionBuffer.push_back(gpuDynInst); + } + } + + wavefront->pendingFetch = false; + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +void +FetchUnit::bindWaveList(std::vector *wave_list) +{ + waveList = wave_list; +} diff --git a/src/gpu-compute/fetch_unit.hh 
b/src/gpu-compute/fetch_unit.hh new file mode 100644 index 000000000..c7c6afb3c --- /dev/null +++ b/src/gpu-compute/fetch_unit.hh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#ifndef __FETCH_UNIT_HH__ +#define __FETCH_UNIT_HH__ + +#include +#include +#include + +#include "arch/gpu_decoder.hh" +#include "base/statistics.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/scheduler.hh" +#include "mem/packet.hh" + +class ComputeUnit; +class Wavefront; + +class FetchUnit +{ + public: + FetchUnit(const ComputeUnitParams* params); + ~FetchUnit(); + void init(ComputeUnit *cu); + void exec(); + void bindWaveList(std::vector *list); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void processFetchReturn(PacketPtr pkt); + static uint32_t globalFetchUnitID; + + private: + bool timingSim; + ComputeUnit *computeUnit; + TheGpuISA::Decoder decoder; + + // Fetch scheduler; Selects one wave from + // the fetch queue for instruction fetching. + // The selection is made according to + // a scheduling policy + Scheduler fetchScheduler; + + // Stores the list of waves that are + // ready to be fetched this cycle + std::vector fetchQueue; + + // Stores the fetch status of all waves dispatched to this SIMD. + // TRUE implies the wave is ready to fetch and is already + // moved to fetchQueue + std::vector> fetchStatusQueue; + + // Pointer to list of waves dispatched on to this SIMD unit + std::vector *waveList; +}; + +#endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc new file mode 100644 index 000000000..913327412 --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/global_memory_pipeline.hh" + +#include "debug/GPUMem.hh" +#include "debug/GPUReg.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), + inflightStores(0), inflightLoads(0) +{ +} + +void +GlobalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + globalMemSize = computeUnit->shader->globalMemSize; + _name = computeUnit->name() + ".GlobalMemPipeline"; +} + +void +GlobalMemPipeline::exec() +{ + // apply any returned global memory operations + GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : + !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; + + bool accessVrf = true; + // check the VRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && + m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + accessVrf && m->statusBitVector == VectorMask(0) && + (computeUnit->shader->coissue_return || + computeUnit->wfWait.at(m->pipeId).rdy())) { + + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doGmReturn(m); + else if (m->v_type == 
VT_32 && m->m_type == Enums::M_S32) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doGmReturn(m); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!gmIssuedRequests.empty()) { + GPUDynInstPtr mp = gmIssuedRequests.front(); + if (mp->m_op == Enums::MO_LD || + (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || + (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { + + if (inflightLoads >= gmQueueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= gmQueueSize) { + return; + } else { + ++inflightStores; + } + } + + mp->initiateAcc(mp); + gmIssuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId, + Enums::MemOpTypeStrings[mp->m_op]); + } +} + +template +void +GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) +{ + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + 
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector regVec; + // iterate over number of destination register operands since + // this is a load or atomic operation + for (int k = 0; k < m->n_reg; ++k) { + assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); + int dst = m->dst_reg + k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst, sizeof(c0), 1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " + "$%s%d <- %d global ld done (src = wavefront " + "ld inst)\n", w->computeUnit->cu_id, w->simdId, + w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", + dst, *p1); + // write the value into the physical VGPR. This is a + // purely functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. + // This simply models the timing aspect of the VRF write operation. + // It does not modify the physical VGPR. 
+ loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), + w, regVec, sizeof(c0), + m->time); + } + } else { + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || + MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time, + -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time, + -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +GlobalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles GM data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh new file mode 100644 index 000000000..ed49f6f6b --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __GLOBAL_MEMORY_PIPELINE_HH__ +#define __GLOBAL_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file global_memory_pipeline.hh + * + * The global memory pipeline issues newly created global memory packets + * from the pipeline to DTLB. The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. + * This stage also retires previously issued loads and stores that have + * returned from the memory sub-system. 
+ */ + +class ComputeUnit; + +class GlobalMemPipeline +{ + public: + GlobalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template void doGmReturn(GPUDynInstPtr m); + + std::queue &getGMReqFIFO() { return gmIssuedRequests; } + std::queue &getGMStRespFIFO() { return gmReturnedStores; } + std::queue &getGMLdRespFIFO() { return gmReturnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return gmReturnedLoads.size() < gmQueueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return gmReturnedStores.size() < gmQueueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (gmIssuedRequests.size() + pendReqs) < gmQueueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int gmQueueSize; + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + // Counters to track the inflight loads and stores + // so that we can provide the proper backpressure + // on the number of inflight memory operations. + int inflightStores; + int inflightLoads; + + // The size of global memory. 
+ int globalMemSize; + + // Global Memory Request FIFO: all global memory requests + // are issued to this FIFO from the memory pipelines + std::queue gmIssuedRequests; + + // Global Store Response FIFO: all responses of global memory + // stores are sent to this FIFO from TCP + std::queue gmReturnedStores; + + // Global Load Response FIFO: all responses of global memory + // loads are sent to this FIFO from TCP + std::queue gmReturnedLoads; +}; + +#endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc new file mode 100644 index 000000000..83e348dbe --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_dyn_inst.hh" + +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, + GPUStaticInst *_staticInst, uint64_t instSeqNum) + : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), + statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) +{ + tlbHitLevel.assign(VSZ, -1); +} + +void +GPUDynInst::execute() +{ + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, + _seqNum); + staticInst->execute(gpuDynInst); +} + +int +GPUDynInst::numSrcRegOperands() +{ + return staticInst->numSrcRegOperands(); +} + +int +GPUDynInst::numDstRegOperands() +{ + return staticInst->numDstRegOperands(); +} + +int +GPUDynInst::getNumOperands() +{ + return staticInst->getNumOperands(); +} + +bool +GPUDynInst::isVectorRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +bool +GPUDynInst::isScalarRegister(int operandIdx) +{ + return staticInst->isScalarRegister(operandIdx); // not isVectorRegister +} + +int +GPUDynInst::getRegisterIndex(int operandIdx) +{ + return staticInst->getRegisterIndex(operandIdx); +} + +int +GPUDynInst::getOperandSize(int operandIdx) +{ + return staticInst->getOperandSize(operandIdx); +} + +bool +GPUDynInst::isDstOperand(int operandIdx) +{
+ return staticInst->isDstOperand(operandIdx); +} + +bool +GPUDynInst::isSrcOperand(int operandIdx) +{ + return staticInst->isSrcOperand(operandIdx); +} + +bool +GPUDynInst::isArgLoad() +{ + return staticInst->isArgLoad(); +} + +const std::string& +GPUDynInst::disassemble() const +{ + return staticInst->disassemble(); +} + +uint64_t +GPUDynInst::seqNum() const +{ + return _seqNum; +} + +Enums::OpType +GPUDynInst::opType() +{ + return staticInst->o_type; +} + +Enums::StorageClassType +GPUDynInst::executedAs() +{ + return staticInst->executed_as; +} + +// Process a memory instruction and (if necessary) submit timing request +void +GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) +{ + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", + cu->cu_id, simdId, wfSlotId, exec_mask); + + staticInst->initiateAcc(gpuDynInst); + time = 0; +} + +bool +GPUDynInst::scalarOp() const +{ + return staticInst->scalarOp(); +} + +void +GPUDynInst::updateStats() +{ + if (staticInst->isLocalMem()) { + // access to LDS (shared) memory + cu->dynamicLMemInstrCnt++; + } else { + // access to global memory + + // update PageDivergence histogram + int number_pages_touched = cu->pagesTouched.size(); + assert(number_pages_touched); + cu->pageDivergenceDist.sample(number_pages_touched); + + std::pair ret; + + for (auto it : cu->pagesTouched) { + // see if this page has been touched before. if not, this also + // inserts the page into the table. + ret = cu->pageAccesses + .insert(ComputeUnit::pageDataStruct::value_type(it.first, + std::make_pair(1, it.second))); + + // if yes, then update the stats + if (!ret.second) { + ret.first->second.first++; + ret.first->second.second += it.second; + } + } + + cu->pagesTouched.clear(); + + // total number of memory instructions (dynamic) + // Atomics are counted as a single memory instruction. 
+ // this is # memory instructions per wavefronts, not per workitem + cu->dynamicGMemInstrCnt++; + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include +#include + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template +class AtomicOpAnd : public TypedAtomicOpFunctor +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template +class AtomicOpOr : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template +class AtomicOpXor : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template +class AtomicOpCAS : public TypedAtomicOpFunctor +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template +class AtomicOpExch : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void execute(T *b) { *b = a; } +}; + +template +class AtomicOpAdd : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template +class AtomicOpSub : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template +class AtomicOpInc : public TypedAtomicOpFunctor +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template +class AtomicOpDec : public TypedAtomicOpFunctor +{ + public: + 
AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template +class AtomicOpMax : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template +class AtomicOpMin : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + Enums::StorageClassType executedAs(); + + // The address of the memory operation + Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) 
+ Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPRs accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * with them. Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it).
When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. + */ + std::function execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + template AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case 
SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction genenerator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == 
Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector statusVector; + std::vector tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc new file mode 100644 index 000000000..4af69c41e --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_exec_context.hh" + +GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf) /* stores both pointers exactly as passed; no other work is done */ + : cu(_cu), wf(_wf) +{ +} + +ComputeUnit* +GPUExecContext::computeUnit() /* returns the stored ComputeUnit pointer (cu) */ +{ + return cu; +} + +Wavefront* +GPUExecContext::wavefront() /* returns the stored Wavefront pointer (wf) */ +{ + return wf; +} diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh new file mode 100644 index 000000000..a3deb9b8f --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_EXEC_CONTEXT_HH__ +#define __GPU_EXEC_CONTEXT_HH__ + +class ComputeUnit; /* forward declarations: only pointers to these types are used below */ +class Wavefront; + +class GPUExecContext /* pairs a ComputeUnit pointer with a Wavefront pointer and exposes both through accessors */ +{ + public: + GPUExecContext(ComputeUnit *_cu, Wavefront *_wf); + Wavefront* wavefront(); + ComputeUnit* computeUnit(); + + protected: + ComputeUnit *cu; /* raw pointer; this class declares no destructor, so it is never freed here */ + Wavefront *wf; /* raw pointer; likewise never freed by this class */ +}; + +#endif // __GPU_EXEC_CONTEXT_HH__ diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc new file mode 100644 index 000000000..bcb8a5f3d --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_static_inst.hh" + +GPUStaticInst::GPUStaticInst(const std::string &opcode) /* defaults: ALU op type, no storage class, instruction numbers 0, vector (non-scalar) op */ + : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0), _ipdInstNum(0), _scalarOp(false) /* _ipdInstNum was previously left indeterminate until ipdInstNum(int) ran */ +{ +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh new file mode 100644 index 000000000..c1de28427 --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.hh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_STATIC_INST_HH__ +#define __GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing static instructions for the GPU. The + * instructions are "static" because they contain no dynamic instruction + * information. GPUStaticInst corresponds to the StaticInst class for the CPU + * models. 
+ */ + +#include <cstdint> +#include <string> + +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" + +class BaseOperand; +class BaseRegOperand; +class Wavefront; + +class GPUStaticInst +{ + public: + GPUStaticInst(const std::string &opcode); + + void instNum(int num) { _instNum = num; } + + int instNum() { return _instNum; } + + void ipdInstNum(int num) { _ipdInstNum = num; } + + int ipdInstNum() const { return _ipdInstNum; } + + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; + virtual void generateDisassembly() = 0; + virtual const std::string &disassemble() = 0; + virtual int getNumOperands() = 0; + virtual bool isCondRegister(int operandIndex) = 0; + virtual bool isScalarRegister(int operandIndex) = 0; + virtual bool isVectorRegister(int operandIndex) = 0; + virtual bool isSrcOperand(int operandIndex) = 0; + virtual bool isDstOperand(int operandIndex) = 0; + virtual int getOperandSize(int operandIndex) = 0; + virtual int getRegisterIndex(int operandIndex) = 0; + virtual int numDstRegOperands() = 0; + virtual int numSrcRegOperands() = 0; + + /* + * Most instructions (including all HSAIL instructions) + * are vector ops, so _scalarOp will be false by default. + * Derived instruction objects that are scalar ops must + * set _scalarOp to true in their constructors. + */ + bool scalarOp() const { return _scalarOp; } + + virtual bool isLocalMem() const + { + fatal("calling isLocalMem() on non-memory instruction.\n"); + + return false; + } + + bool isArgLoad() { return false; } /* non-virtual, so this always reports false */ + virtual uint32_t instSize() = 0; + + // only used for memory instructions + virtual void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + fatal("calling initiateAcc() on a non-memory instruction.\n"); + } + + virtual uint32_t getTargetPc() { return 0; } + + /** + * Query whether the instruction is an unconditional jump i.e., the jump + * is always executed because there is no condition to be evaluated.
+ * + * If the instruction is not of branch type, the result is always false. + * + * @return True if the instruction is an unconditional jump. + */ + virtual bool unconditionalJumpInstruction() { return false; } + + static uint64_t dynamic_id_count; + + Enums::OpType o_type; + // For flat memory accesses + Enums::StorageClassType executed_as; + + protected: /* default handlers below abort via fatal(); subclasses override the ones they support */ + virtual void + execLdAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execSt(GPUDynInstPtr gpuDynInst) + { + fatal("calling execSt() on a non-store instruction.\n"); + } + + virtual void + execAtomic(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomic() on a non-atomic instruction.\n"); + } + + virtual void + execAtomicAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + } + + const std::string opcode; + std::string disassembly; + int _instNum; + /** + * Identifier of the immediate post-dominator instruction. + */ + int _ipdInstNum; + + bool _scalarOp; +}; + +#endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc new file mode 100644 index 000000000..de005fd04 --- /dev/null +++ b/src/gpu-compute/gpu_tlb.cc @@ -0,0 +1,1801 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3.
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/gpu_tlb.hh" + +#include +#include + +#include "arch/x86/faults.hh" +#include "arch/x86/insts/microldstop.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/misc.hh" +#include "arch/x86/x86_traits.hh" +#include "base/bitfield.hh" +#include "base/output.hh" +#include "base/trace.hh" +#include "cpu/base.hh" +#include "cpu/thread_context.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUTLB.hh" +#include "mem/packet_access.hh" +#include "mem/page_table.hh" +#include "mem/request.hh" +#include "sim/process.hh" + +namespace X86ISA +{ + + GpuTLB::GpuTLB(const Params *p) /* allocates the entry array, puts every entry on its set's free list, clamps maxCoalescedReqs to assoc and creates one side port per connection */ + : MemObject(p), configAddress(0), size(p->size), + cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this) + { + assoc = p->assoc; + assert(assoc <= size); + numSets = size/assoc; /* integer division: any remainder of size/assoc is dropped */ + allocationPolicy = p->allocationPolicy; + hasMemSidePort = false; + accessDistance = p->accessDistance; + clock =
p->clk_domain->clockPeriod(); + + tlb = new GpuTlbEntry[size]; + std::memset(tlb, 0, sizeof(GpuTlbEntry) * size); /* NOTE(review): byte-wise zeroing of already-constructed objects is only safe if GpuTlbEntry is trivially copyable -- TODO confirm */ + + freeList.resize(numSets); + entryList.resize(numSets); + + for (int set = 0; set < numSets; ++set) { + for (int way = 0; way < assoc; ++way) { + int x = set*assoc + way; /* entries of one set are contiguous in tlb[] */ + freeList[set].push_back(&tlb[x]); + } + } + + FA = (size == assoc); /* a single set means fully associative */ + + /** + * @warning: the set-associative version assumes you have a + * fixed page size of 4KB. + * If the page size is greater than 4KB (as defined in the + * TheISA::PageBytes), then there are various issues w/ the current + * implementation (you'd have the same 8KB page being replicated in + * different sets etc) + */ + setMask = numSets - 1; /* only a valid set-index mask when numSets is a power of two */ + + #if 0 + // GpuTLB doesn't yet support full system + walker = p->walker; + walker->setTLB(this); + #endif + + maxCoalescedReqs = p->maxOutstandingReqs; + + // Do not allow maxCoalescedReqs to be more than the TLB associativity + if (maxCoalescedReqs > assoc) { + maxCoalescedReqs = assoc; + cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc); + } + + outstandingReqs = 0; + hitLatency = p->hitLatency; + missLatency1 = p->missLatency1; + missLatency2 = p->missLatency2; + + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + } + + // fixme: this is never called?
+ GpuTLB::~GpuTLB() + { + // make sure all the hash-maps are empty + assert(translationReturnEvent.empty()); + + // delete the TLB + delete[] tlb; + } + + BaseSlavePort& + GpuTLB::getSlavePort(const std::string &if_name, PortID idx) /* returns cpuSidePort[idx] for the "slave" interface; panics on any other name or an index past the end */ + { + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("GpuTLB::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("GpuTLB::getSlavePort: unknown port %s\n", if_name); + } + } + + BaseMasterPort& + GpuTLB::getMasterPort(const std::string &if_name, PortID idx) /* returns memSidePort[idx] for the "master" interface and records that a mem-side port is in use */ + { + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("GpuTLB::getMasterPort: unknown index %d\n", idx); + } + + hasMemSidePort = true; + + return *memSidePort[idx]; + } else { + panic("GpuTLB::getMasterPort: unknown port %s\n", if_name); + } + } + + GpuTlbEntry* + GpuTLB::insert(Addr vpn, GpuTlbEntry &entry) /* copies entry into a free slot of its set, or into the slot at the back of the set's entry list when none is free */ + { + GpuTlbEntry *newEntry = nullptr; + + /** + * vpn holds the virtual page address + * The least significant bits are simply masked + */ + int set = (vpn >> TheISA::PageShift) & setMask; + + if (!freeList[set].empty()) { + newEntry = freeList[set].front(); + freeList[set].pop_front(); + } else { + newEntry = entryList[set].back(); + entryList[set].pop_back(); + } + + *newEntry = entry; + newEntry->vaddr = vpn; + entryList[set].push_front(newEntry); + + return newEntry; + } + + GpuTLB::EntryList::iterator + GpuTLB::lookupIt(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + if (FA) { + assert(!set); + } + + auto entry = entryList[set].begin(); + for (; entry != entryList[set].end(); ++entry) { + int page_size = (*entry)->size(); + + if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { + DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " + "with size %#x.\n", va, (*entry)->vaddr, page_size); + + if (update_lru) { + entryList[set].push_front(*entry); + entryList[set].erase(entry); + entry =
entryList[set].begin(); + } + + break; + } + } + + return entry; + } + + GpuTlbEntry* + GpuTLB::lookup(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + auto entry = lookupIt(va, update_lru); + + if (entry == entryList[set].end()) + return nullptr; + else + return *entry; + } + + void + GpuTLB::invalidateAll() + { + DPRINTF(GPUTLB, "Invalidating all entries.\n"); + + for (int i = 0; i < numSets; ++i) { + while (!entryList[i].empty()) { + GpuTlbEntry *entry = entryList[i].front(); + entryList[i].pop_front(); + freeList[i].push_back(entry); + } + } + } + + void + GpuTLB::setConfigAddress(uint32_t addr) + { + configAddress = addr; + } + + void + GpuTLB::invalidateNonGlobal() + { + DPRINTF(GPUTLB, "Invalidating all non global entries.\n"); + + for (int i = 0; i < numSets; ++i) { + for (auto entryIt = entryList[i].begin(); + entryIt != entryList[i].end();) { + if (!(*entryIt)->global) { + freeList[i].push_back(*entryIt); + entryList[i].erase(entryIt++); + } else { + ++entryIt; + } + } + } + } + + void + GpuTLB::demapPage(Addr va, uint64_t asn) + { + + int set = (va >> TheISA::PageShift) & setMask; + auto entry = lookupIt(va, false); + + if (entry != entryList[set].end()) { + freeList[set].push_back(*entry); + entryList[set].erase(entry); + } + } + + Fault + GpuTLB::translateInt(RequestPtr req, ThreadContext *tc) + { + DPRINTF(GPUTLB, "Addresses references internal memory.\n"); + Addr vaddr = req->getVaddr(); + Addr prefix = (vaddr >> 3) & IntAddrPrefixMask; + + if (prefix == IntAddrPrefixCPUID) { + panic("CPUID memory space not yet implemented!\n"); + } else if (prefix == IntAddrPrefixMSR) { + vaddr = vaddr >> 3; + req->setFlags(Request::MMAPPED_IPR); + Addr regNum = 0; + + switch (vaddr & ~IntAddrPrefixMask) { + case 0x10: + regNum = MISCREG_TSC; + break; + case 0x1B: + regNum = MISCREG_APIC_BASE; + break; + case 0xFE: + regNum = MISCREG_MTRRCAP; + break; + case 0x174: + regNum = MISCREG_SYSENTER_CS; + break; + case 0x175: + regNum = 
MISCREG_SYSENTER_ESP; + break; + case 0x176: + regNum = MISCREG_SYSENTER_EIP; + break; + case 0x179: + regNum = MISCREG_MCG_CAP; + break; + case 0x17A: + regNum = MISCREG_MCG_STATUS; + break; + case 0x17B: + regNum = MISCREG_MCG_CTL; + break; + case 0x1D9: + regNum = MISCREG_DEBUG_CTL_MSR; + break; + case 0x1DB: + regNum = MISCREG_LAST_BRANCH_FROM_IP; + break; + case 0x1DC: + regNum = MISCREG_LAST_BRANCH_TO_IP; + break; + case 0x1DD: + regNum = MISCREG_LAST_EXCEPTION_FROM_IP; + break; + case 0x1DE: + regNum = MISCREG_LAST_EXCEPTION_TO_IP; + break; + case 0x200: + regNum = MISCREG_MTRR_PHYS_BASE_0; + break; + case 0x201: + regNum = MISCREG_MTRR_PHYS_MASK_0; + break; + case 0x202: + regNum = MISCREG_MTRR_PHYS_BASE_1; + break; + case 0x203: + regNum = MISCREG_MTRR_PHYS_MASK_1; + break; + case 0x204: + regNum = MISCREG_MTRR_PHYS_BASE_2; + break; + case 0x205: + regNum = MISCREG_MTRR_PHYS_MASK_2; + break; + case 0x206: + regNum = MISCREG_MTRR_PHYS_BASE_3; + break; + case 0x207: + regNum = MISCREG_MTRR_PHYS_MASK_3; + break; + case 0x208: + regNum = MISCREG_MTRR_PHYS_BASE_4; + break; + case 0x209: + regNum = MISCREG_MTRR_PHYS_MASK_4; + break; + case 0x20A: + regNum = MISCREG_MTRR_PHYS_BASE_5; + break; + case 0x20B: + regNum = MISCREG_MTRR_PHYS_MASK_5; + break; + case 0x20C: + regNum = MISCREG_MTRR_PHYS_BASE_6; + break; + case 0x20D: + regNum = MISCREG_MTRR_PHYS_MASK_6; + break; + case 0x20E: + regNum = MISCREG_MTRR_PHYS_BASE_7; + break; + case 0x20F: + regNum = MISCREG_MTRR_PHYS_MASK_7; + break; + case 0x250: + regNum = MISCREG_MTRR_FIX_64K_00000; + break; + case 0x258: + regNum = MISCREG_MTRR_FIX_16K_80000; + break; + case 0x259: + regNum = MISCREG_MTRR_FIX_16K_A0000; + break; + case 0x268: + regNum = MISCREG_MTRR_FIX_4K_C0000; + break; + case 0x269: + regNum = MISCREG_MTRR_FIX_4K_C8000; + break; + case 0x26A: + regNum = MISCREG_MTRR_FIX_4K_D0000; + break; + case 0x26B: + regNum = MISCREG_MTRR_FIX_4K_D8000; + break; + case 0x26C: + regNum = MISCREG_MTRR_FIX_4K_E0000; + 
break; + case 0x26D: + regNum = MISCREG_MTRR_FIX_4K_E8000; + break; + case 0x26E: + regNum = MISCREG_MTRR_FIX_4K_F0000; + break; + case 0x26F: + regNum = MISCREG_MTRR_FIX_4K_F8000; + break; + case 0x277: + regNum = MISCREG_PAT; + break; + case 0x2FF: + regNum = MISCREG_DEF_TYPE; + break; + case 0x400: + regNum = MISCREG_MC0_CTL; + break; + case 0x404: + regNum = MISCREG_MC1_CTL; + break; + case 0x408: + regNum = MISCREG_MC2_CTL; + break; + case 0x40C: + regNum = MISCREG_MC3_CTL; + break; + case 0x410: + regNum = MISCREG_MC4_CTL; + break; + case 0x414: + regNum = MISCREG_MC5_CTL; + break; + case 0x418: + regNum = MISCREG_MC6_CTL; + break; + case 0x41C: + regNum = MISCREG_MC7_CTL; + break; + case 0x401: + regNum = MISCREG_MC0_STATUS; + break; + case 0x405: + regNum = MISCREG_MC1_STATUS; + break; + case 0x409: + regNum = MISCREG_MC2_STATUS; + break; + case 0x40D: + regNum = MISCREG_MC3_STATUS; + break; + case 0x411: + regNum = MISCREG_MC4_STATUS; + break; + case 0x415: + regNum = MISCREG_MC5_STATUS; + break; + case 0x419: + regNum = MISCREG_MC6_STATUS; + break; + case 0x41D: + regNum = MISCREG_MC7_STATUS; + break; + case 0x402: + regNum = MISCREG_MC0_ADDR; + break; + case 0x406: + regNum = MISCREG_MC1_ADDR; + break; + case 0x40A: + regNum = MISCREG_MC2_ADDR; + break; + case 0x40E: + regNum = MISCREG_MC3_ADDR; + break; + case 0x412: + regNum = MISCREG_MC4_ADDR; + break; + case 0x416: + regNum = MISCREG_MC5_ADDR; + break; + case 0x41A: + regNum = MISCREG_MC6_ADDR; + break; + case 0x41E: + regNum = MISCREG_MC7_ADDR; + break; + case 0x403: + regNum = MISCREG_MC0_MISC; + break; + case 0x407: + regNum = MISCREG_MC1_MISC; + break; + case 0x40B: + regNum = MISCREG_MC2_MISC; + break; + case 0x40F: + regNum = MISCREG_MC3_MISC; + break; + case 0x413: + regNum = MISCREG_MC4_MISC; + break; + case 0x417: + regNum = MISCREG_MC5_MISC; + break; + case 0x41B: + regNum = MISCREG_MC6_MISC; + break; + case 0x41F: + regNum = MISCREG_MC7_MISC; + break; + case 0xC0000080: + regNum = 
MISCREG_EFER; + break; + case 0xC0000081: + regNum = MISCREG_STAR; + break; + case 0xC0000082: + regNum = MISCREG_LSTAR; + break; + case 0xC0000083: + regNum = MISCREG_CSTAR; + break; + case 0xC0000084: + regNum = MISCREG_SF_MASK; + break; + case 0xC0000100: + regNum = MISCREG_FS_BASE; + break; + case 0xC0000101: + regNum = MISCREG_GS_BASE; + break; + case 0xC0000102: + regNum = MISCREG_KERNEL_GS_BASE; + break; + case 0xC0000103: + regNum = MISCREG_TSC_AUX; + break; + case 0xC0010000: + regNum = MISCREG_PERF_EVT_SEL0; + break; + case 0xC0010001: + regNum = MISCREG_PERF_EVT_SEL1; + break; + case 0xC0010002: + regNum = MISCREG_PERF_EVT_SEL2; + break; + case 0xC0010003: + regNum = MISCREG_PERF_EVT_SEL3; + break; + case 0xC0010004: + regNum = MISCREG_PERF_EVT_CTR0; + break; + case 0xC0010005: + regNum = MISCREG_PERF_EVT_CTR1; + break; + case 0xC0010006: + regNum = MISCREG_PERF_EVT_CTR2; + break; + case 0xC0010007: + regNum = MISCREG_PERF_EVT_CTR3; + break; + case 0xC0010010: + regNum = MISCREG_SYSCFG; + break; + case 0xC0010016: + regNum = MISCREG_IORR_BASE0; + break; + case 0xC0010017: + regNum = MISCREG_IORR_BASE1; + break; + case 0xC0010018: + regNum = MISCREG_IORR_MASK0; + break; + case 0xC0010019: + regNum = MISCREG_IORR_MASK1; + break; + case 0xC001001A: + regNum = MISCREG_TOP_MEM; + break; + case 0xC001001D: + regNum = MISCREG_TOP_MEM2; + break; + case 0xC0010114: + regNum = MISCREG_VM_CR; + break; + case 0xC0010115: + regNum = MISCREG_IGNNE; + break; + case 0xC0010116: + regNum = MISCREG_SMM_CTL; + break; + case 0xC0010117: + regNum = MISCREG_VM_HSAVE_PA; + break; + default: + return std::make_shared(0); + } + //The index is multiplied by the size of a MiscReg so that + //any memory dependence calculations will not see these as + //overlapping. + req->setPaddr(regNum * sizeof(MiscReg)); + return NoFault; + } else if (prefix == IntAddrPrefixIO) { + // TODO If CPL > IOPL or in virtual mode, check the I/O permission + // bitmap in the TSS. 
+ + Addr IOPort = vaddr & ~IntAddrPrefixMask; + // Make sure the address fits in the expected 16 bit IO address + // space. + assert(!(IOPort & ~0xFFFF)); + + if (IOPort == 0xCF8 && req->getSize() == 4) { + req->setFlags(Request::MMAPPED_IPR); + req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg)); + } else if ((IOPort & ~mask(2)) == 0xCFC) { + req->setFlags(Request::UNCACHEABLE); + + Addr configAddress = + tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS); + + if (bits(configAddress, 31, 31)) { + req->setPaddr(PhysAddrPrefixPciConfig | + mbits(configAddress, 30, 2) | + (IOPort & mask(2))); + } else { + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + } else { + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + return NoFault; + } else { + panic("Access to unrecognized internal address space %#x.\n", + prefix); + } + } + + /** + * TLB_lookup will only perform a TLB lookup returning true on a TLB hit + * and false on a TLB miss. + * Many of the checks about different modes have been converted to + * assertions, since these parts of the code are not really used. + * On a hit it will update the LRU stack. + */ + bool + GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats) + { + bool tlb_hit = false; + #ifndef NDEBUG + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + #endif + + assert(seg != SEGMENT_REG_MS); + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // make sure we are in 64-bit mode + assert(m5Reg.mode == LongMode); + + // If paging is enabled, do the translation. 
+ if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + //update LRU stack on a hit + GpuTlbEntry *entry = lookup(vaddr, true); + + if (entry) + tlb_hit = true; + + if (!update_stats) { + // functional tlb access for memory initialization + // i.e., memory seeding or instr. seeding -> don't update + // TLB and stats + return tlb_hit; + } + + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + } else { + localNumTLBHits++; + } + } + } + + return tlb_hit; + } + + Fault + GpuTLB::translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + bool &delayedResponse, bool timing, int &latency) + { + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + bool storeCheck = flags & (StoreCheck << FlagShift); + + // If this is true, we're dealing with a request + // to a non-memory address space. + if (seg == SEGMENT_REG_MS) { + return translateInt(req, tc); + } + + delayedResponse = false; + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr); + + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + // If protected mode has been enabled... + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // If we're not in 64-bit mode, do protection/limit checks + if (m5Reg.mode != LongMode) { + DPRINTF(GPUTLB, "Not in long mode. Checking segment " + "protection.\n"); + + // Check for a null segment selector. 
+ if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR || + seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS) + && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) { + return std::make_shared(0); + } + + bool expandDown = false; + SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg)); + + if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) { + if (!attr.writable && (mode == BaseTLB::Write || + storeCheck)) + return std::make_shared(0); + + if (!attr.readable && mode == BaseTLB::Read) + return std::make_shared(0); + + expandDown = attr.expandDown; + + } + + Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg)); + Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg)); + // This assumes we're not in 64 bit mode. If we were, the + // default address size is 64 bits, overridable to 32. + int size = 32; + bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift)); + SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR); + + if ((csAttr.defaultSize && sizeOverride) || + (!csAttr.defaultSize && !sizeOverride)) { + size = 16; + } + + Addr offset = bits(vaddr - base, size - 1, 0); + Addr endOffset = offset + req->getSize() - 1; + + if (expandDown) { + DPRINTF(GPUTLB, "Checking an expand down segment.\n"); + warn_once("Expand down segments are untested.\n"); + + if (offset <= limit || endOffset <= limit) + return std::make_shared(0); + } else { + if (offset > limit || endOffset > limit) + return std::make_shared(0); + } + } + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + // The vaddr already has the segment base applied. 
+ GpuTlbEntry *entry = lookup(vaddr); + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + if (timing) { + latency = missLatency1; + } + + if (FullSystem) { + fatal("GpuTLB doesn't support full-system mode\n"); + } else { + DPRINTF(GPUTLB, "Handling a TLB miss for address %#x " + "at pc %#x.\n", vaddr, tc->instAddr()); + + Process *p = tc->getProcessPtr(); + GpuTlbEntry newEntry; + bool success = p->pTable->lookup(vaddr, newEntry); + + if (!success && mode != BaseTLB::Execute) { + // penalize a "page fault" more + if (timing) { + latency += missLatency2; + } + + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!success) { + return std::make_shared(vaddr, true, + mode, true, + false); + } else { + newEntry.valid = success; + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", + alignedVaddr, newEntry.pageStart()); + + entry = insert(alignedVaddr, newEntry); + } + + DPRINTF(GPUTLB, "Miss was serviced.\n"); + } + } else { + localNumTLBHits++; + + if (timing) { + latency = hitLatency; + } + } + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && + !(flags & (CPL0FlagBit << FlagShift))); + + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + bool badWrite = (!entry->writable && (inUser || cr0.wp)); + + if ((inUser && !entry->user) || (mode == BaseTLB::Write && + badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + return std::make_shared(vaddr, true, mode, + inUser, false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. 
+ return std::make_shared(vaddr, true, + BaseTLB::Write, + inUser, false); + } + + + DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection " + "checks.\n", entry->paddr); + + int page_size = entry->size(); + Addr paddr = entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + req->setPaddr(paddr); + + if (entry->uncacheable) + req->setFlags(Request::UNCACHEABLE); + } else { + //Use the address which already has segmentation applied. + DPRINTF(GPUTLB, "Paging disabled.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + } else { + // Real mode + DPRINTF(GPUTLB, "In real mode.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + + // Check for an access to the local APIC + if (FullSystem) { + LocalApicBase localApicBase = + tc->readMiscRegNoEffect(MISCREG_APIC_BASE); + + Addr baseAddr = localApicBase.base * PageBytes; + Addr paddr = req->getPaddr(); + + if (baseAddr <= paddr && baseAddr + PageBytes > paddr) { + // Force the access to be uncacheable. 
+ req->setFlags(Request::UNCACHEABLE); + req->setPaddr(x86LocalAPICAddress(tc->contextId(), + paddr - baseAddr)); + } + } + + return NoFault; + }; + + Fault + GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency) + { + bool delayedResponse; + + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, + latency); + } + + void + GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, int &latency) + { + bool delayedResponse; + assert(translation); + + Fault fault = GpuTLB::translate(req, tc, translation, mode, + delayedResponse, true, latency); + + if (!delayedResponse) + translation->finish(fault, req, tc, mode); + } + + Walker* + GpuTLB::getWalker() + { + return walker; + } + + + void + GpuTLB::serialize(CheckpointOut &cp) const + { + } + + void + GpuTLB::unserialize(CheckpointIn &cp) + { + } + + void + GpuTLB::regStats() + { + localNumTLBAccesses + .name(name() + ".local_TLB_accesses") + .desc("Number of TLB accesses") + ; + + localNumTLBHits + .name(name() + ".local_TLB_hits") + .desc("Number of TLB hits") + ; + + localNumTLBMisses + .name(name() + ".local_TLB_misses") + .desc("Number of TLB misses") + ; + + localTLBMissRate + .name(name() + ".local_TLB_miss_rate") + .desc("TLB miss rate") + ; + + accessCycles + .name(name() + ".access_cycles") + .desc("Cycles spent accessing this TLB level") + ; + + pageTableCycles + .name(name() + ".page_table_cycles") + .desc("Cycles spent accessing the page table") + ; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + + numUniquePages + .name(name() + ".unique_pages") + .desc("Number of unique pages touched") + ; + + localCycles + .name(name() + ".local_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. 
latency over incoming coalesced reqs") + ; + + localLatency = localCycles / localNumTLBAccesses; + + globalNumTLBAccesses + .name(name() + ".global_TLB_accesses") + .desc("Number of TLB accesses") + ; + + globalNumTLBHits + .name(name() + ".global_TLB_hits") + .desc("Number of TLB hits") + ; + + globalNumTLBMisses + .name(name() + ".global_TLB_misses") + .desc("Number of TLB misses") + ; + + globalTLBMissRate + .name(name() + ".global_TLB_miss_rate") + .desc("TLB miss rate") + ; + + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + + avgReuseDistance + .name(name() + ".avg_reuse_distance") + .desc("avg. reuse distance over all pages (in ticks)") + ; + + } + + /** + * Do the TLB lookup for this coalesced request and schedule + * another event cycles later. + */ + + void + GpuTLB::issueTLBLookup(PacketPtr pkt) + { + assert(pkt); + assert(pkt->senderState); + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + ThreadContext * tmp_tc = sender_state->tc; + + DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", + virt_page_addr); + + int req_cnt = sender_state->reqCnt.back(); + + if (update_stats) { + accessCycles -= (curTick() * req_cnt); + localCycles -= curTick(); + updatePageFootprint(virt_page_addr); + globalNumTLBAccesses += req_cnt; + } + + tlbOutcome lookup_outcome = TLB_MISS; + RequestPtr tmp_req = pkt->req; + + // Access the TLB and figure out if it's a hit or a miss. 
+ bool success = tlbLookup(tmp_req, tmp_tc, update_stats); + + if (success) { + lookup_outcome = TLB_HIT; + // Put the entry in SenderState + GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false); + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + + if (update_stats) { + // the reqCnt has an entry per level, so its size tells us + // which level we are in + sender_state->hitLevel = sender_state->reqCnt.size(); + globalNumTLBHits += req_cnt; + } + } else { + if (update_stats) + globalNumTLBMisses += req_cnt; + } + + /* + * We now know the TLB lookup outcome (if it's a hit or a miss), as well + * as the TLB access latency. + * + * We create and schedule a new TLBEvent which will help us take the + * appropriate actions (e.g., update TLB on a hit, send request to lower + * level TLB on a miss, or start a page walk if this was the last-level + * TLB) + */ + TLBEvent *tlb_event = + new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); + + if (translationReturnEvent.count(virt_page_addr)) { + panic("Virtual Page Address %#x already has a return event\n", + virt_page_addr); + } + + translationReturnEvent[virt_page_addr] = tlb_event; + assert(tlb_event); + + DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", + curTick() + this->ticks(hitLatency)); + + schedule(tlb_event, curTick() + this->ticks(hitLatency)); + } + + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, + PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) + { + } + + /** + * Do Paging protection checks. If we encounter a page fault, then + * an assertion is fired. 
+ */ + void + GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry * tlb_entry, Mode mode) + { + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & (StoreCheck << FlagShift); + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + + bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); + + if ((inUser && !tlb_entry->user) || + (mode == BaseTLB::Write && badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + assert(false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + assert(false); + } + } + + /** + * handleTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * The latter calls handelHit with TLB miss as tlbOutcome. + */ + void + GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, + PacketPtr pkt) + { + + assert(pkt); + Addr vaddr = pkt->req->getVaddr(); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", + vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. 
+ new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + /** + * At this point the packet carries an up-to-date tlbEntry pointer + * in its senderState. + * Next step is to do the paging protection checks. + */ + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + pagingProtectionChecks(tc, pkt, local_entry, mode); + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + // Since this packet will be sent through the cpu side slave port, + // it must be converted to a response pkt if it is not one already + if (pkt->isRequest()) { + pkt->makeTimingResponse(); + } + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) { + pkt->req->setFlags(Request::UNCACHEABLE); + } + + //send packet back to coalescer + cpuSidePort[0]->sendTimingResp(pkt); + //schedule cleanup event + cleanupQueue.push(virt_page_addr); + + // schedule this only once per cycle. + // The check is required because we might have multiple translations + // returning the same cycle + // this is a maximum priority event and must be on the same cycle + // as the cleanup event in TLBCoalescer to avoid a race with + // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); + } + + /** + * Here we take the appropriate actions based on the result of the + * TLB lookup. 
+ */ + void + GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt) + { + DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); + + assert(translationReturnEvent[virtPageAddr]); + assert(pkt); + + TranslationState *tmp_sender_state = + safe_cast(pkt->senderState); + + int req_cnt = tmp_sender_state->reqCnt.back(); + bool update_stats = !tmp_sender_state->prefetch; + + + if (outcome == TLB_HIT) { + handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); + + if (update_stats) { + accessCycles += (req_cnt * curTick()); + localCycles += curTick(); + } + + } else if (outcome == TLB_MISS) { + + DPRINTF(GPUTLB, "This is a TLB miss\n"); + if (update_stats) { + accessCycles += (req_cnt*curTick()); + localCycles += curTick(); + } + + if (hasMemSidePort) { + // the one cyle added here represent the delay from when we get + // the reply back till when we propagate it to the coalescer + // above. + if (update_stats) { + accessCycles += (req_cnt * 1); + localCycles += 1; + } + + /** + * There is a TLB below. Send the coalesced request. + * We actually send the very first packet of all the + * pending packets for this virtual page address. + */ + if (!memSidePort[0]->sendTimingReq(pkt)) { + DPRINTF(GPUTLB, "Failed sending translation request to " + "lower level TLB for addr %#x\n", virtPageAddr); + + memSidePort[0]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "Sent translation request to lower level " + "TLB for addr %#x\n", virtPageAddr); + } + } else { + //this is the last level TLB. 
Start a page walk + DPRINTF(GPUTLB, "Last level TLB - start a page walk for " + "addr %#x\n", virtPageAddr); + + if (update_stats) + pageTableCycles -= (req_cnt*curTick()); + + TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; + assert(tlb_event); + tlb_event->updateOutcome(PAGE_WALK); + schedule(tlb_event, curTick() + ticks(missLatency2)); + } + } else if (outcome == PAGE_WALK) { + if (update_stats) + pageTableCycles += (req_cnt*curTick()); + + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virtPageAddr); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + Process *p = sender_state->tc->getProcessPtr(); + TlbEntry newEntry; + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virtPageAddr); + #endif + bool success; + success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) { + success = p->pTable->lookup(vaddr, newEntry); + } + } + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = + new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success); + + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else if (outcome == MISS_RETURN) { + /** we add an extra cycle in the return path of the translation + * requests in between the various TLB levels. 
+ */ + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else { + assert(false); + } + } + + void + GpuTLB::TLBEvent::process() + { + tlb->translationReturn(virtPageAddr, outcome, pkt); + } + + const char* + GpuTLB::TLBEvent::description() const + { + return "trigger translationDoneEvent"; + } + + void + GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) + { + outcome = _outcome; + } + + Addr + GpuTLB::TLBEvent::getTLBEventVaddr() + { + return virtPageAddr; + } + + /* + * recvTiming receives a coalesced timing request from a TLBCoalescer + * and it calls issueTLBLookup() + * It only rejects the packet if we have exceeded the max + * outstanding number of requests for the TLB + */ + bool + GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) + { + if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { + tlb->issueTLBLookup(pkt); + // update number of outstanding translation requests + tlb->outstandingReqs++; + return true; + } else { + DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", + tlb->outstandingReqs); + return false; + } + } + + /** + * handleFuncTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * It updates LRU, inserts the TLB entry on a miss + * depending on the allocation policy and does the required + * protection checks. It does NOT create a new packet to + * update the packet's addr; this is done in hsail-gpu code. 
+ */ + void + GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) + { + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + Addr vaddr = pkt->req->getVaddr(); + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " + "%#x\n", vaddr); + + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " + "%#x\n", vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes); + + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + // Do paging checks if it's a normal functional access. If it's for a + // prefetch, then sometimes you can try to prefetch something that won't + // pass protection. We don't actually want to fault becuase there is no + // demand access to deem this a violation. Just put it in the TLB and + // it will fault if indeed a future demand access touches it in + // violation. 
+ if (!sender_state->prefetch && sender_state->tlbEntry->valid) + pagingProtectionChecks(tc, pkt, local_entry, mode); + + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) + pkt->req->setFlags(Request::UNCACHEABLE); + } + + // This is used for atomic translations. Need to + // make it all happen during the same cycle. + void + GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt) + { + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + bool update_stats = !sender_state->prefetch; + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + if (update_stats) + tlb->updatePageFootprint(virt_page_addr); + + // do the TLB lookup without updating the stats + bool success = tlb->tlbLookup(pkt->req, tc, update_stats); + tlbOutcome tlb_outcome = success ? 
TLB_HIT : TLB_MISS; + + // functional mode means no coalescing + // global metrics are the same as the local metrics + if (update_stats) { + tlb->globalNumTLBAccesses++; + + if (success) { + sender_state->hitLevel = sender_state->reqCnt.size(); + tlb->globalNumTLBHits++; + } + } + + if (!success) { + if (update_stats) + tlb->globalNumTLBMisses++; + if (tlb->hasMemSidePort) { + // there is a TLB below -> propagate down the TLB hierarchy + tlb->memSidePort[0]->sendFunctional(pkt); + // If no valid translation from a prefetch, then just return + if (sender_state->prefetch && !pkt->req->hasPaddr()) + return; + } else { + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virt_page_addr); + + Process *p = tc->getProcessPtr(); + TlbEntry newEntry; + + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virt_page_addr); + #endif + + bool success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!sender_state->prefetch) { + // no PageFaults are permitted after + // the second page table lookup + assert(success); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr, + newEntry.paddr, + success); + } else { + // If this was a prefetch, then do the normal thing if it + // was a successful translation. Otherwise, send an empty + // TLB entry back so that it can be figured out as empty and + // handled accordingly. 
+ if (success) { + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, + newEntry.vaddr, + newEntry.paddr, + success); + } else { + DPRINTF(GPUPrefetch, "Prefetch failed %#x\n", + alignedVaddr); + + sender_state->tlbEntry = new GpuTlbEntry(); + + return; + } + } + } + } else { + DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", + tlb->lookup(pkt->req->getVaddr())); + + GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(), + update_stats); + + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + } + // This is the function that would populate pkt->req with the paddr of + // the translation. But if no translation happens (i.e Prefetch fails) + // then the early returns in the above code wiill keep this function + // from executing. + tlb->handleFuncTranslationReturn(pkt, tlb_outcome); + } + + void + GpuTLB::CpuSidePort::recvReqRetry() + { + // The CPUSidePort never sends anything but replies. No retries + // expected. + assert(false); + } + + AddrRangeList + GpuTLB::CpuSidePort::getAddrRanges() const + { + // currently not checked by the master + AddrRangeList ranges; + + return ranges; + } + + /** + * MemSidePort receives the packet back. + * We need to call the handleTranslationReturn + * and propagate up the hierarchy. + */ + bool + GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) + { + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", + virt_page_addr); + + TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; + assert(tlb_event); + assert(virt_page_addr == tlb_event->getTLBEventVaddr()); + + tlb_event->updateOutcome(MISS_RETURN); + tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + + return true; + } + + void + GpuTLB::MemSidePort::recvReqRetry() + { + // No retries should reach the TLB. 
The retries + // should only reach the TLBCoalescer. + assert(false); + } + + void + GpuTLB::cleanup() + { + while (!cleanupQueue.empty()) { + Addr cleanup_addr = cleanupQueue.front(); + cleanupQueue.pop(); + + // delete TLBEvent + TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; + delete old_tlb_event; + translationReturnEvent.erase(cleanup_addr); + + // update number of outstanding requests + outstandingReqs--; + } + + /** the higher level coalescer should retry if it has + * any pending requests. + */ + for (int i = 0; i < cpuSidePort.size(); ++i) { + cpuSidePort[i]->sendRetryReq(); + } + } + + void + GpuTLB::updatePageFootprint(Addr virt_page_addr) + { + + std::pair ret; + + AccessInfo tmp_access_info; + tmp_access_info.lastTimeAccessed = 0; + tmp_access_info.accessesPerPage = 0; + tmp_access_info.totalReuseDistance = 0; + tmp_access_info.sumDistance = 0; + tmp_access_info.meanDistance = 0; + + ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, + tmp_access_info)); + + bool first_page_access = ret.second; + + if (first_page_access) { + numUniquePages++; + } else { + int accessed_before; + accessed_before = curTick() - ret.first->second.lastTimeAccessed; + ret.first->second.totalReuseDistance += accessed_before; + } + + ret.first->second.accessesPerPage++; + ret.first->second.lastTimeAccessed = curTick(); + + if (accessDistance) { + ret.first->second.localTLBAccesses + .push_back(localNumTLBAccesses.value()); + } + } + + void + GpuTLB::exitCallback() + { + std::ostream *page_stat_file = nullptr; + + if (accessDistance) { + + // print per page statistics to a separate file (.csv format) + // simout is the gem5 output directory (default is m5out or the one + // specified with -d + page_stat_file = simout.create(name().c_str()); + + // print header + *page_stat_file << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; + } + + // update avg. 
reuse distance footprint + AccessPatternTable::iterator iter, iter_begin, iter_end; + unsigned int sum_avg_reuse_distance_per_page = 0; + + // iterate through all pages seen by this TLB + for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) { + sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance / + iter->second.accessesPerPage; + + if (accessDistance) { + unsigned int tmp = iter->second.localTLBAccesses[0]; + unsigned int prev = tmp; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + if (i) { + tmp = prev + 1; + } + + prev = iter->second.localTLBAccesses[i]; + // update the localTLBAccesses value + // with the actual differece + iter->second.localTLBAccesses[i] -= tmp; + // compute the sum of AccessDistance per page + // used later for mean + iter->second.sumDistance += + iter->second.localTLBAccesses[i]; + } + + iter->second.meanDistance = + iter->second.sumDistance / iter->second.accessesPerPage; + + // compute std_dev and max (we need a second round because we + // need to know the mean value + unsigned int max_distance = 0; + unsigned int stddev_distance = 0; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + unsigned int tmp_access_distance = + iter->second.localTLBAccesses[i]; + + if (tmp_access_distance > max_distance) { + max_distance = tmp_access_distance; + } + + unsigned int diff = + tmp_access_distance - iter->second.meanDistance; + stddev_distance += pow(diff, 2); + + } + + stddev_distance = + sqrt(stddev_distance/iter->second.accessesPerPage); + + if (page_stat_file) { + *page_stat_file << std::hex << iter->first << ","; + *page_stat_file << std::dec << max_distance << ","; + *page_stat_file << std::dec << iter->second.meanDistance + << ","; + *page_stat_file << std::dec << stddev_distance; + *page_stat_file << std::endl; + } + + // erase the localTLBAccesses array + iter->second.localTLBAccesses.clear(); + } + } + + if (!TLBFootprint.empty()) { + avgReuseDistance = + 
sum_avg_reuse_distance_per_page / TLBFootprint.size(); + } + + //clear the TLBFootprint map + TLBFootprint.clear(); + } +} // namespace X86ISA + +X86ISA::GpuTLB* +X86GPUTLBParams::create() +{ + return new X86ISA::GpuTLB(this); +} + diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh new file mode 100644 index 000000000..3549c598b --- /dev/null +++ b/src/gpu-compute/gpu_tlb.hh @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __GPU_TLB_HH__ +#define __GPU_TLB_HH__ + +#include +#include +#include +#include +#include + +#include "arch/generic/tlb.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/segment.hh" +#include "base/callback.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/compute_unit.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/X86GPUTLB.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +namespace X86ISA +{ + class GpuTlbEntry : public TlbEntry + { + public: + GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid) + : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { } + + GpuTlbEntry() : TlbEntry() { } + + bool valid; + }; + + class GpuTLB : public MemObject + { + protected: + friend class Walker; + + typedef std::list EntryList; + + uint32_t configAddress; + + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the python + // config files. + int clock; + + public: + // clock related functions ; maps to-and-from Simulation ticks and + // object clocks. 
+ Tick frequency() const { return SimClock::Frequency / clock; } + + Tick + ticks(int numCycles) const + { + return (Tick)clock * numCycles; + } + + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + typedef X86GPUTLBParams Params; + GpuTLB(const Params *p); + ~GpuTLB(); + + typedef enum BaseTLB::Mode Mode; + + class Translation + { + public: + virtual ~Translation() { } + + /** + * Signal that the translation has been delayed due to a hw page + * table walk. + */ + virtual void markDelayed() = 0; + + /** + * The memory for this object may be dynamically allocated, and it + * may be responsible for cleaning itslef up which will happen in + * this function. Once it's called the object is no longer valid. + */ + virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc, + Mode mode) = 0; + }; + + void dumpAll(); + GpuTlbEntry *lookup(Addr va, bool update_lru=true); + void setConfigAddress(uint32_t addr); + + protected: + EntryList::iterator lookupIt(Addr va, bool update_lru=true); + Walker *walker; + + public: + Walker *getWalker(); + void invalidateAll(); + void invalidateNonGlobal(); + void demapPage(Addr va, uint64_t asn); + + protected: + int size; + int assoc; + int numSets; + + /** + * true if this is a fully-associative TLB + */ + bool FA; + Addr setMask; + + /** + * Allocation Policy: true if we always allocate on a hit, false + * otherwise. Default is true. + */ + bool allocationPolicy; + + /** + * if true, then this is not the last level TLB + */ + bool hasMemSidePort; + + /** + * Print out accessDistance stats. One stat file + * per TLB. + */ + bool accessDistance; + + GpuTlbEntry *tlb; + + /* + * It's a per-set list. As long as we have not reached + * the full capacity of the given set, grab an entry from + * the freeList. + */ + std::vector freeList; + + /** + * An entryList per set is the equivalent of an LRU stack; + * it's used to guide replacement decisions. 
The head of the list + * contains the MRU TLB entry of the given set. If the freeList + * for this set is empty, the last element of the list + * is evicted (i.e., dropped on the floor). + */ + std::vector entryList; + + Fault translateInt(RequestPtr req, ThreadContext *tc); + + Fault translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, bool &delayedResponse, + bool timing, int &latency); + + public: + // latencies for a TLB hit, miss and page fault + int hitLatency; + int missLatency1; + int missLatency2; + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
+ Stats::Scalar avgReuseDistance; + + void regStats(); + void updatePageFootprint(Addr virt_page_addr); + void printAccessPattern(); + + + Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency); + + void translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + int &latency); + + Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); + Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); + + GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry); + + // Checkpointing + virtual void serialize(CheckpointOut& cp) const; + virtual void unserialize(CheckpointIn& cp); + void issueTranslation(); + enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; + bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats); + + void handleTranslationReturn(Addr addr, tlbOutcome outcome, + PacketPtr pkt); + + void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); + + void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry *tlb_entry, Mode mode); + + void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry, + Addr phys_page_addr); + + void issueTLBLookup(PacketPtr pkt); + + // CpuSidePort is the TLB Port closer to the CPU/CU side + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + virtual void recvRespRetry() { assert(false); } + virtual AddrRangeList getAddrRanges() const; + }; + + /** + * MemSidePort is the TLB Port closer to the memory side + * If this is a last level TLB then this port will not be connected. 
+ * + * Future action item: if we ever do real page walks, then this port + * should be connected to a RubyPort. + */ + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + std::deque retries; + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + // TLB ports on the cpu Side + std::vector cpuSidePort; + // TLB ports on the memory side + std::vector memSidePort; + + BaseMasterPort &getMasterPort(const std::string &if_name, + PortID idx=InvalidPortID); + + BaseSlavePort &getSlavePort(const std::string &if_name, + PortID idx=InvalidPortID); + + /** + * TLB TranslationState: this currently is a somewhat bastardization of + * the usage of SenderState, whereby the receiver of a packet is not + * usually supposed to need to look at the contents of the senderState, + * you're really only supposed to look at what you pushed on, pop it + * off, and send it back. + * + * However, since there is state that we want to pass to the TLBs using + * the send/recv Timing/Functional/etc. APIs, which don't allow for new + * arguments, we need a common TLB senderState to pass between TLBs, + * both "forwards" and "backwards." + * + * So, basically, the rule is that any packet received by a TLB port + * (cpuside OR memside) must be safely castable to a TranslationState. + */ + + struct TranslationState : public Packet::SenderState + { + // TLB mode, read or write + Mode tlbMode; + // Thread context associated with this req + ThreadContext *tc; + + /* + * TLB entry to be populated and passed back and filled in + * previous TLBs. Equivalent to the data cache concept of + * "data return." 
+ */ + GpuTlbEntry *tlbEntry; + // Is this a TLB prefetch request? + bool prefetch; + // When was the req for this translation issued + uint64_t issueTime; + // Remember where this came from + std::vectorports; + + // keep track of #uncoalesced reqs per packet per TLB level; + // reqCnt per level >= reqCnt higher level + std::vector reqCnt; + // TLB level this packet hit in; 0 if it hit in the page table + int hitLevel; + Packet::SenderState *saved; + + TranslationState(Mode tlb_mode, ThreadContext *_tc, + bool _prefetch=false, + Packet::SenderState *_saved=nullptr) + : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), + prefetch(_prefetch), issueTime(0), + hitLevel(0),saved(_saved) { } + }; + + // maximum number of permitted coalesced requests per cycle + int maxCoalescedReqs; + + // Current number of outstandings coalesced requests. + // Should be <= maxCoalescedReqs + int outstandingReqs; + + /** + * A TLBEvent is scheduled after the TLB lookup and helps us take the + * appropriate actions: + * (e.g., update TLB on a hit, + * send request to lower level TLB on a miss, + * or start a page walk if this was the last-level TLB). + */ + void translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt); + + class TLBEvent : public Event + { + private: + GpuTLB *tlb; + Addr virtPageAddr; + /** + * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK + */ + tlbOutcome outcome; + PacketPtr pkt; + + public: + TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, + PacketPtr _pkt); + + void process(); + const char *description() const; + + // updateOutcome updates the tlbOutcome of a TLBEvent + void updateOutcome(tlbOutcome _outcome); + Addr getTLBEventVaddr(); + }; + + std::unordered_map translationReturnEvent; + + // this FIFO queue keeps track of the virt. 
page addresses + // that are pending cleanup + std::queue cleanupQueue; + + // the cleanupEvent is scheduled after a TLBEvent triggers in order to + // free memory and do the required clean-up + void cleanup(); + + EventWrapper cleanupEvent; + + /** + * This hash map will use the virtual page address as a key + * and will keep track of total number of accesses per page + */ + + struct AccessInfo + { + unsigned int lastTimeAccessed; // last access to this page + unsigned int accessesPerPage; + // need to divide it by accessesPerPage at the end + unsigned int totalReuseDistance; + + /** + * The field below will help us compute the access distance, + * that is the number of (coalesced) TLB accesses that + * happened in between each access to this page + * + * localTLBAccesses[x] is the value of localNumTLBAccesses + * when the page was accessed for the x-th time + */ + std::vector localTLBAccesses; + unsigned int sumDistance; + unsigned int meanDistance; + }; + + typedef std::unordered_map AccessPatternTable; + AccessPatternTable TLBFootprint; + + // Called at the end of simulation to dump page access stats. + void exitCallback(); + + EventWrapper exitEvent; + }; +} + +#endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh new file mode 100644 index 000000000..9f358e23c --- /dev/null +++ b/src/gpu-compute/hsa_code.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_CODE_HH__ +#define __HSA_CODE_HH__ + +#include +#include + +#include "arch/gpu_types.hh" +#include "config/the_gpu_isa.hh" + +class HsaKernelInfo; + +/* @class HsaCode + * base code object for the set of HSA kernels associated + * with a single application. this class provides the common + * methods for creating, accessing, and storing information + * about kernel and variable symbols, symbol name, memory + * segment sizes, and instruction count, etc. 
+ */ + +class HsaCode +{ + public: + // readonly_data starts out null and funcarg_size at 0 + HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0), + _name(name) + { + } + + // polymorphic base: keep deletion through an HsaCode pointer well-defined + virtual ~HsaCode() { } + + enum class MemorySegment { + NONE, + FLAT, + GLOBAL, + READONLY, + KERNARG, + GROUP, + PRIVATE, + SPILL, + ARG, + EXTSPACE0 + }; + + const std::string& name() const { return _name; } + // number of entries currently held in _insts + int numInsts() const { return _insts.size(); } + // mutable access to the instruction list, so callers can fill it in place + std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; } + + // stores the pointer as given; no copy of the data is made here + void + setReadonlyData(uint8_t *_readonly_data) + { + readonly_data = _readonly_data; + } + + // implemented by the format-specific code object + virtual int getSize(MemorySegment segment) const = 0; + virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0; + + uint8_t *readonly_data; + int funcarg_size; + + protected: + // An array that stores instruction indices (0 through kernel size) + // for a kernel passed to code object constructor as an argument. + std::vector<TheGpuISA::RawMachInst> _insts; + + private: + const std::string _name; +}; + +#endif // __HSA_CODE_HH__ diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh new file mode 100644 index 000000000..396913dac --- /dev/null +++ b/src/gpu-compute/hsa_kernel_info.hh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSA_KERNEL_INFO_HH__ +#define __HSA_KERNEL_INFO_HH__ + +// This file defines the public interface between the HSA emulated +// driver and application programs. 
+ +#include + +static const int HSA_GET_SIZES = 0x4801; +static const int HSA_GET_KINFO = 0x4802; +static const int HSA_GET_STRINGS = 0x4803; +static const int HSA_GET_CODE = 0x4804; +static const int HSA_GET_READONLY_DATA = 0x4805; +static const int HSA_GET_CU_CNT = 0x4806; +static const int HSA_GET_VSZ = 0x4807; + +// Return value (via buffer ptr) for HSA_GET_SIZES +struct HsaDriverSizes +{ + uint32_t num_kernels; + uint32_t string_table_size; + uint32_t code_size; + uint32_t readonly_size; +}; + +// HSA_GET_KINFO returns an array of num_kernels of these structs +struct HsaKernelInfo +{ + // byte offset into string table + uint32_t name_offs; + // byte offset into code array + uint32_t code_offs; + uint32_t static_lds_size; + uint32_t private_mem_size; + uint32_t spill_mem_size; + // Number of s registers + uint32_t sRegCount; + // Number of d registers + uint32_t dRegCount; + // Number of c registers + uint32_t cRegCount; +}; + +#endif // __HSA_KERNEL_INFO_HH__ diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc new file mode 100644 index 000000000..91dfb160e --- /dev/null +++ b/src/gpu-compute/hsa_object.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/hsa_object.hh" + +#include <fstream> + +#include "gpu-compute/brig_object.hh" + +// records the file name; readonlyData starts out null +HsaObject::HsaObject(const std::string &fname) + : readonlyData(nullptr), filename(fname) +{ +} + +/* + * read the whole file into memory and offer it to each registered + * tryFileFuncs entry in turn; the first non-null result is returned. + * any failure (unreadable file, unrecognized format) ends in fatal(). + */ +HsaObject* +HsaObject::createHsaObject(const std::string &fname) +{ + HsaObject *hsaObj = nullptr; + uint8_t *file_data = nullptr; + int file_length = 0; + + // open positioned at the end so tellg() gives the file length + std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in | + std::ifstream::binary); + + // the path comes from the user, so report the failure explicitly + // instead of relying on asserts that may be compiled out + if (!code_file.is_open() || !code_file.good()) { + fatal("Could not open HSA object file: %s.\n", fname); + } + + file_length = code_file.tellg(); + + // tellg() reports -1 on failure; never size the buffer from it + if (file_length < 0) { + fatal("Could not determine size of HSA object file: %s.\n", fname); + } + + code_file.seekg(0, code_file.beg); + file_data = new uint8_t[file_length]; + code_file.read((char*)file_data, file_length); + + // a short or failed read would leave part of the buffer uninitialized + if (!code_file) { + delete[] file_data; + fatal("Could not read HSA object file: %s.\n", fname); + } + + code_file.close(); + + for (const auto &tryFile : tryFileFuncs) { + if ((hsaObj = tryFile(fname, file_length, file_data))) { + return hsaObj; + } + } + + // no loader recognized the file + delete[] file_data; + fatal("Unknown HSA object type for file: %s.\n", fname); + + return nullptr; +} diff --git 
a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh new file mode 100644 index 000000000..1f08f5d80 --- /dev/null +++ b/src/gpu-compute/hsa_object.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_OBJECT_HH__ +#define __HSA_OBJECT_HH__ + +#include <functional> +#include <string> +#include <vector> + +class HsaCode; + +/* @class HsaObject + * base loader object for HSA kernels. this class provides + * the base method definitions for loading, storing, and + * accessing HSA kernel objects into the simulator. + */ + +class HsaObject +{ + public: + HsaObject(const std::string &fileName); + + // polymorphic base: createHsaObject() hands out derived objects through + // an HsaObject pointer, so deletion through it must be well-defined + virtual ~HsaObject() { } + + // reads the file and returns the first object a tryFileFuncs entry builds + static HsaObject* createHsaObject(const std::string &fname); + // format probes, each called as (file name, file length, file data); + // an entry returns null when it does not recognize the file + static std::vector<std::function<HsaObject*(const std::string&, int, + uint8_t*)>> tryFileFuncs; + + virtual HsaCode* getKernel(const std::string &name) const = 0; + virtual HsaCode* getKernel(int i) const = 0; + virtual HsaCode* getFunction(const std::string &name) const = 0; + virtual int numKernels() const = 0; + + // name of the file this object was created from + const std::string& name() const { return filename; } + + uint8_t *readonlyData; + + + protected: + const std::string filename; +}; + +#endif // __HSA_OBJECT_HH__ diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc new file mode 100644 index 000000000..b0ddf0161 --- /dev/null +++ b/src/gpu-compute/hsail_code.cc @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/hsail_code.hh" + +#include "arch/gpu_types.hh" +#include "arch/hsail/Brig.h" +#include "arch/hsail/operand.hh" +#include "config/the_gpu_isa.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/kernel_cfg.hh" + +using namespace Brig; + +int getBrigDataTypeBytes(BrigType16_t t); + +HsailCode::HsailCode(const std::string &name_str) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ +} + +void +HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj, + StorageMap *objStorageMap) +{ + storageMap = objStorageMap; + + // set pointer so that decoding process can find this kernel context when + // needed + obj->currentCode = this; + + if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION && + code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) { + fatal("unexpected directive kind %d inside kernel/function init\n", + code_dir->base.kind); + } + + DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n", + code_dir->firstCodeBlockEntry); + + // clear these 
static vars so we can properly track the max index + // for this kernel + SRegOperand::maxRegIdx = 0; + DRegOperand::maxRegIdx = 0; + CRegOperand::maxRegIdx = 0; + setPrivateSize(0); + + const BrigBase *entryPtr = brigNext((BrigBase*)code_dir); + const BrigBase *endPtr = + obj->getCodeSectionEntry(code_dir->nextModuleEntry); + + int inst_idx = 0; + std::vector instructions; + int funcarg_size_scope = 0; + + // walk through instructions in code section and directives in + // directive section in parallel, processing directives that apply + // when we reach the relevant code point. + while (entryPtr < endPtr) { + switch (entryPtr->kind) { + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_variable, symbol is: %s\n", + obj->getString(sym->name)); + + StorageElement *se = storageMap->addSymbol(sym, obj); + + if (sym->segment == BRIG_SEGMENT_PRIVATE) { + setPrivateSize(se->size); + } else { // spill + funcarg_size_scope += se->size; + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel *lbl = + (const BrigDirectiveLabel*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_label, label is: %s \n", + obj->getString(lbl->name)); + + labelMap.addLabel(lbl, inst_idx, obj); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "Initializing code, directive " + "is kind_pragma\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_comment\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_start\n"); + + storageMap->resetOffset(BRIG_SEGMENT_ARG); + funcarg_size_scope = 0; + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + 
"kind_arg_block_end\n"); + + funcarg_size = funcarg_size < funcarg_size_scope ? + funcarg_size_scope : funcarg_size; + } + break; + + case BRIG_KIND_DIRECTIVE_END: + DPRINTF(HSAILObject, "Initializing code, dircetive is " + "kind_end\n"); + + break; + + default: + if (entryPtr->kind >= BRIG_KIND_INST_BEGIN && + entryPtr->kind <= BRIG_KIND_INST_END) { + + BrigInstBase *instPtr = (BrigInstBase*)entryPtr; + TheGpuISA::MachInst machInst = { instPtr, obj }; + GPUStaticInst *iptr = decoder.decode(machInst); + + if (iptr) { + DPRINTF(HSAILObject, "Initializing code, processing inst " + "#%d idx %d: OPCODE=%d\n", + inst_idx, _insts.size(), instPtr->opcode); + + TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr); + iptr->instNum(inst_idx); + _insts.push_back(inst_num); + instructions.push_back(iptr); + } + ++inst_idx; + } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + entryPtr->kind < BRIG_KIND_OPERAND_END) { + warn("unexpected operand entry in code segment\n"); + } else { + // there are surely some more cases we will need to handle, + // but we'll deal with them as we find them. 
+ fatal("unexpected directive kind %d inside kernel scope\n", + entryPtr->kind); + } + } + + entryPtr = brigNext(entryPtr); + } + + // compute Control Flow Graph for current kernel + ControlFlowInfo::assignImmediatePostDominators(instructions); + + max_sreg = SRegOperand::maxRegIdx; + max_dreg = DRegOperand::maxRegIdx; + max_creg = CRegOperand::maxRegIdx; + + obj->currentCode = nullptr; +} + +HsailCode::HsailCode(const std::string &name_str, + const BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ + init(code_dir, obj, objStorageMap); +} + +void +LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index, + const BrigObject *obj) +{ + std::string lbl_name = obj->getString(lblDir->name); + Label &lbl = map[lbl_name]; + + if (lbl.defined()) { + fatal("Attempt to redefine existing label %s\n", lbl_name); + } + + lbl.define(lbl_name, inst_index); + DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index); +} + +Label* +LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir, + const BrigObject *obj) +{ + std::string name = obj->getString(lblDir->name); + Label &lbl = map[name]; + lbl.checkName(name); + + return &lbl; +} + +int +getBrigDataTypeBytes(BrigType16_t t) +{ + switch (t) { + case BRIG_TYPE_S8: + case BRIG_TYPE_U8: + case BRIG_TYPE_B8: + return 1; + + case BRIG_TYPE_S16: + case BRIG_TYPE_U16: + case BRIG_TYPE_B16: + case BRIG_TYPE_F16: + return 2; + + case BRIG_TYPE_S32: + case BRIG_TYPE_U32: + case BRIG_TYPE_B32: + case BRIG_TYPE_F32: + return 4; + + case BRIG_TYPE_S64: + case BRIG_TYPE_U64: + case BRIG_TYPE_B64: + case BRIG_TYPE_F64: + return 8; + + case BRIG_TYPE_B1: + + default: + fatal("unhandled symbol data type %d", t); + return 0; + } +} + +StorageElement* +StorageSpace::addSymbol(const BrigDirectiveVariable *sym, + const BrigObject *obj) +{ + const char *sym_name = obj->getString(sym->name); + uint64_t size = 0; + uint64_t offset = 
0; + + if (sym->type & BRIG_TYPE_ARRAY) { + size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY); + size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo); + + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type & + ~BRIG_TYPE_ARRAY)); + } else { + size = getBrigDataTypeBytes(sym->type); + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type)); + } + + nextOffset = offset + size; + + DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n", + segmentNames[segment], sym_name, size, offset, sym->init); + + StorageElement* se = new StorageElement(sym_name, offset, size, sym); + elements.push_back(se); + elements_by_addr.insert(AddrRange(offset, offset + size - 1), se); + elements_by_brigptr[sym] = se; + + return se; +} + +StorageElement* +StorageSpace::findSymbol(std::string name) +{ + for (auto it : elements) { + if (it->name == name) { + return it; + } + } + + return nullptr; +} + +StorageElement* +StorageSpace::findSymbol(uint64_t addr) +{ + assert(elements_by_addr.size() > 0); + + auto se = elements_by_addr.find(addr); + + if (se == elements_by_addr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageElement* +StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr) +{ + assert(elements_by_brigptr.size() > 0); + + auto se = elements_by_brigptr.find(brigptr); + + if (se == elements_by_brigptr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageMap::StorageMap(StorageMap *outerScope) + : outerScopeMap(outerScope) +{ + for (int i = 0; i < NumSegments; ++i) + space[i] = new StorageSpace((BrigSegment)i); +} + +StorageElement* +StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj) +{ + BrigSegment8_t segment = sym->segment; + + assert(segment >= Brig::BRIG_SEGMENT_FLAT); + assert(segment < NumSegments); + + return space[segment]->addSymbol(sym, obj); +} + +int +StorageMap::getSize(Brig::BrigSegment segment) +{ + assert(segment > 
Brig::BRIG_SEGMENT_GLOBAL); + assert(segment < NumSegments); + + if (segment != Brig::BRIG_SEGMENT_GROUP && + segment != Brig::BRIG_SEGMENT_READONLY) { + return space[segment]->getSize(); + } else { + int ret = space[segment]->getSize(); + + if (outerScopeMap) { + ret += outerScopeMap->getSize(segment); + } + + return ret; + } +} + +void +StorageMap::resetOffset(Brig::BrigSegment segment) +{ + space[segment]->resetOffset(); +} + +StorageElement* +StorageMap::findSymbol(BrigSegment segment, std::string name) +{ + StorageElement *se = space[segment]->findSymbol(name); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, name); + + return nullptr; +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(addr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, addr); + + return nullptr; + +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, + const BrigDirectiveVariable *brigptr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(brigptr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, brigptr); + + return nullptr; + +} diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh new file mode 100644 index 000000000..d9fbcc577 --- /dev/null +++ b/src/gpu-compute/hsail_code.hh @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __HSAIL_CODE_HH__ +#define __HSAIL_CODE_HH__ + +#include +#include +#include +#include +#include + +#include "arch/gpu_decoder.hh" +#include "arch/hsail/Brig.h" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/misc.hh" + +class BrigObject; +class GPUStaticInst; + +// number of set bits among the low sz bits of src +inline int +popcount(uint64_t src, int sz) +{ + int cnt = 0; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + ++cnt; + src >>= 1; + } + + return cnt; +} + +// index of the lowest set bit among the low sz bits of src; +// sz when none of them is set +inline int +firstbit(uint64_t src, int sz) +{ + int i; + + for (i = 0; i < sz; ++i) { + if (src & 1) + break; + src >>= 1; + } + + return i; +} + +// index of the highest set bit among the low sz bits of src; +// -1 when none of them is set +inline int +lastbit(uint64_t src, int sz) +{ + int i0 = -1; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + + return i0; +} + +// treats src as an sz-bit value whose top bit is the sign: returns the +// index of the highest bit below the sign bit that differs from it, +// or -1 when every lower bit equals the sign bit +inline int +signbit(uint64_t src, int sz) +{ + int i0 = -1; + + // the mask must be built in 64 bits: a plain int 1 cannot be shifted + // to bit 31 and above without overflowing + if (src & (1ULL << (sz - 1))) { + for (int i = 0; i < sz - 1; ++i) { + if (!(src & 1)) + i0 = i; + src >>= 1; + } + } else { + for (int i = 0; i < sz - 1; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + } + + return i0; +} + +// reverses the order of the low sz bits of src +inline uint64_t +bitrev(uint64_t src, int sz) +{ + uint64_t r = 0; + + for (int i = 0; i < sz; ++i) { + r <<= 1; + if (src & 1) + r |= 1; + src >>= 1; + } + + return r; +} + +// upper 32 bits of the 64-bit product +inline uint64_t +mul_hi(uint32_t a, uint32_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +// signed variant; the arithmetic shift keeps the sign of the product +inline uint64_t +mul_hi(int32_t a, int32_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +// upper 64 bits of the 128-bit product, assembled from 32-bit partial +// products because a * b itself only keeps the lower 64 bits +inline uint64_t +mul_hi(uint64_t a, uint64_t b) +{ + const uint64_t a_lo = a & 0xffffffffULL; + const uint64_t a_hi = a >> 32; + const uint64_t b_lo = b & 0xffffffffULL; + const uint64_t b_hi = b >> 32; + + const uint64_t lo_lo = a_lo * b_lo; + const uint64_t hi_lo = a_hi * b_lo; + const uint64_t lo_hi = a_lo * b_hi; + const uint64_t hi_hi = a_hi * b_hi; + + // carries out of the middle 64 bits; this sum cannot overflow + const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xffffffffULL) + lo_hi; + + return hi_hi + (hi_lo >> 32) + (cross >> 32); +} + +// signed upper 64 bits: start from the unsigned result and correct for +// negative operands, since a negative value read as unsigned is +// (value + 2^64) and so adds the other operand to the upper half +inline uint64_t +mul_hi(int64_t a, int64_t b) +{ + uint64_t hi = mul_hi((uint64_t)a, (uint64_t)b); + + if (a < 0) + hi -= (uint64_t)b; + if (b < 0) + hi -= (uint64_t)a; + + return hi; +} + +// no high-half multiply is defined here for floating point; always 0 +inline uint64_t +mul_hi(double a, double b) +{ + return 0; +} + +class Label +{ + public: + std::string name; + int value; + + Label() : value(-1) + { + } + + bool defined() { return value != -1; } + + void + 
checkName(std::string &_name)
+    {
+        // the first caller names the label; later callers must agree
+        if (name.empty()) {
+            name = _name;
+        } else {
+            assert(name == _name);
+        }
+    }
+
+    void
+    define(std::string &_name, int _value)
+    {
+        assert(!defined());
+        assert(_value != -1);
+        value = _value;
+        checkName(_name);
+    }
+
+    int
+    get()
+    {
+        assert(defined());
+        return value;
+    }
+};
+
+// Labels keyed by name.
+class LabelMap
+{
+    std::map<std::string, Label> map;
+
+  public:
+    LabelMap() { }
+
+    void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
+                  const BrigObject *obj);
+
+    Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
+                    const BrigObject *obj);
+};
+
+const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;
+
+extern const char *segmentNames[];
+
+// One symbol placed in a segment: its name, offset, size and BRIG directive.
+class StorageElement
+{
+  public:
+    std::string name;
+    uint64_t offset;
+
+    uint64_t size;
+    const Brig::BrigDirectiveVariable *brigSymbol;
+    StorageElement(const char *_name, uint64_t _offset, int _size,
+                   const Brig::BrigDirectiveVariable *sym)
+        : name(_name), offset(_offset), size(_size), brigSymbol(sym)
+    {
+    }
+};
+
+// The symbols of a single segment, searchable by name, address or directive.
+class StorageSpace
+{
+    typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
+            DirVarToSE_map;
+
+    std::list<StorageElement*> elements;
+    AddrRangeMap<StorageElement*> elements_by_addr;
+    DirVarToSE_map elements_by_brigptr;
+
+    uint64_t nextOffset;
+    Brig::BrigSegment segment;
+
+  public:
+    StorageSpace(Brig::BrigSegment _class)
+        : nextOffset(0), segment(_class)
+    {
+    }
+
+    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+                              const BrigObject *obj);
+
+    StorageElement* findSymbol(std::string name);
+    StorageElement* findSymbol(uint64_t addr);
+    StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);
+
+    int getSize() { return nextOffset; }
+    void resetOffset() { nextOffset = 0; }
+};
+
+// One StorageSpace per segment, with an optional enclosing scope.
+class StorageMap
+{
+    StorageMap *outerScopeMap;
+    StorageSpace *space[NumSegments];
+
+  public:
+    StorageMap(StorageMap *outerScope = nullptr);
+
+    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+                              const BrigObject *obj);
+
+    StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
+    StorageElement*
findSymbol(Brig::BrigSegment segment, uint64_t addr); + + StorageElement* findSymbol(Brig::BrigSegment segment, + const Brig::BrigDirectiveVariable *brigptr); + + // overloaded version to avoid casting + StorageElement* + findSymbol(Brig::BrigSegment8_t segment, std::string name) + { + return findSymbol((Brig::BrigSegment)segment, name); + } + + int getSize(Brig::BrigSegment segment); + void resetOffset(Brig::BrigSegment segment); +}; + +typedef enum +{ + BT_DEFAULT, + BT_B8, + BT_U8, + BT_U16, + BT_U32, + BT_U64, + BT_S8, + BT_S16, + BT_S32, + BT_S64, + BT_F16, + BT_F32, + BT_F64, + BT_NULL +} base_type_e; + +/* @class HsailCode + * the HsailCode class is used to store information + * about HSA kernels stored in the BRIG format. it holds + * all information about a kernel, function, or variable + * symbol and provides methods for accessing that + * information. + */ + +class HsailCode final : public HsaCode +{ + public: + TheGpuISA::Decoder decoder; + + StorageMap *storageMap; + LabelMap labelMap; + uint32_t kernarg_start; + uint32_t kernarg_end; + int32_t private_size; + + int32_t readonly_size; + + // We track the maximum register index used for each register + // class when we load the code so we can size the register files + // appropriately (i.e., one more than the max index). 
+ uint32_t max_creg; // maximum c-register index + uint32_t max_sreg; // maximum s-register index + uint32_t max_dreg; // maximum d-register index + + HsailCode(const std::string &name_str, + const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, + StorageMap *objStorageMap); + + // this version is used to create a placeholder when + // we encounter a kernel-related directive before the + // kernel itself + HsailCode(const std::string &name_str); + + void init(const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap); + + void + generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const + { + hsaKernelInfo->sRegCount = max_sreg + 1; + hsaKernelInfo->dRegCount = max_dreg + 1; + hsaKernelInfo->cRegCount = max_creg + 1; + + hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP); + + hsaKernelInfo->private_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8); + + hsaKernelInfo->spill_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8); + } + + int + getSize(MemorySegment segment) const + { + Brig::BrigSegment brigSeg; + + switch (segment) { + case MemorySegment::NONE: + brigSeg = Brig::BRIG_SEGMENT_NONE; + break; + case MemorySegment::FLAT: + brigSeg = Brig::BRIG_SEGMENT_FLAT; + break; + case MemorySegment::GLOBAL: + brigSeg = Brig::BRIG_SEGMENT_GLOBAL; + break; + case MemorySegment::READONLY: + brigSeg = Brig::BRIG_SEGMENT_READONLY; + break; + case MemorySegment::KERNARG: + brigSeg = Brig::BRIG_SEGMENT_KERNARG; + break; + case MemorySegment::GROUP: + brigSeg = Brig::BRIG_SEGMENT_GROUP; + break; + case MemorySegment::PRIVATE: + brigSeg = Brig::BRIG_SEGMENT_PRIVATE; + break; + case MemorySegment::SPILL: + brigSeg = Brig::BRIG_SEGMENT_SPILL; + break; + case MemorySegment::ARG: + brigSeg = Brig::BRIG_SEGMENT_ARG; + break; + case MemorySegment::EXTSPACE0: + brigSeg = Brig::BRIG_SEGMENT_AMD_GCN; + break; + default: + fatal("Unknown BrigSegment type.\n"); + } + + return getSize(brigSeg); + } 
+ + private: + int + getSize(Brig::BrigSegment segment) const + { + if (segment == Brig::BRIG_SEGMENT_PRIVATE) { + // with the code generated by new HSA compiler the assertion + // does not hold anymore.. + //assert(private_size != -1); + return private_size; + } else { + return storageMap->getSize(segment); + } + } + + public: + StorageElement* + findSymbol(Brig::BrigSegment segment, uint64_t addr) + { + return storageMap->findSymbol(segment, addr); + } + + void + setPrivateSize(int32_t _private_size) + { + private_size = _private_size; + } + + Label* + refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj) + { + return labelMap.refLabel(lbl, obj); + } +}; + +#endif // __HSAIL_CODE_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc new file mode 100644 index 000000000..7e0e10912 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/kernel_cfg.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <string>
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+// Build a temporary CFG for the kernel and store each block's immediate
+// post-dominator on the last instruction of that block.
+void
+ControlFlowInfo::assignImmediatePostDominators(
+        const std::vector<GPUStaticInst*>& instructions)
+{
+    ControlFlowInfo cfg(instructions);
+    cfg.findImmediatePostDominators();
+}
+
+
+ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
+    instructions(insts)
+{
+    createBasicBlocks();
+    connectBasicBlocks();
+}
+
+// Return the block containing instruction inst_num, or nullptr when no
+// block covers it (the exit block has size 0 and never matches).
+BasicBlock*
+ControlFlowInfo::basicBlock(int inst_num) const {
+    for (auto& block: basicBlocks) {
+        int first_block_id = block->firstInstruction->instNum();
+        if (inst_num >= first_block_id &&
+                inst_num < first_block_id + block->size) {
+            return block.get();
+        }
+    }
+    return nullptr;
+}
+
+
+// Last instruction of a block; nullptr for the (empty) exit block.
+GPUStaticInst*
+ControlFlowInfo::lastInstruction(const BasicBlock* block) const
+{
+    if (block->isExit()) {
+        return nullptr;
+    }
+
+    return instructions.at(block->firstInstruction->instNum() +
+                           block->size - 1);
+}
+
+BasicBlock*
+ControlFlowInfo::postDominator(const BasicBlock* block) const
+{
+    if (block->isExit()) {
+        return nullptr;
+    }
+    return
basicBlock(lastInstruction(block)->ipdInstNum());
+}
+
+// Split the instruction stream into basic blocks. A block starts at every
+// "leader": instruction 0, every branch target, and every instruction that
+// follows a branch. An empty exit block is appended last.
+void
+ControlFlowInfo::createBasicBlocks()
+{
+    assert(!instructions.empty());
+    std::set<int> leaders;
+    // first instruction is a leader
+    leaders.insert(0);
+    // Scan from index 0: a branch in the very first slot still makes its
+    // target and its fall-through instruction leaders.
+    for (int i = 0; i < instructions.size(); i++) {
+        GPUStaticInst* instruction = instructions[i];
+        if (instruction->o_type == Enums::OT_BRANCH) {
+            const int target_pc = instruction->getTargetPc();
+            leaders.insert(target_pc);
+            leaders.insert(i + 1);
+        }
+    }
+
+    size_t block_size = 0;
+    for (int i = 0; i < instructions.size(); i++) {
+        if (leaders.find(i) != leaders.end()) {
+            uint32_t id = basicBlocks.size();
+            if (id > 0) {
+                // close the previous block before opening a new one
+                basicBlocks.back()->size = block_size;
+            }
+            block_size = 0;
+            basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
+        }
+        block_size++;
+    }
+    basicBlocks.back()->size = block_size;
+    // exit basic block
+    basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
+}
+
+// Fill in the successor set of every non-exit block.
+void
+ControlFlowInfo::connectBasicBlocks()
+{
+    BasicBlock* exit_bb = basicBlocks.back().get();
+    for (auto& bb : basicBlocks) {
+        if (bb->isExit()) {
+            break;
+        }
+        GPUStaticInst* last = lastInstruction(bb.get());
+        if (last->o_type == Enums::OT_RET) {
+            bb->successorIds.insert(exit_bb->id);
+            // Only this block is finished; blocks after a return must
+            // still be connected, so keep iterating.
+            continue;
+        }
+        if (last->o_type == Enums::OT_BRANCH) {
+            const uint32_t target_pc = last->getTargetPc();
+            BasicBlock* target_bb = basicBlock(target_pc);
+            bb->successorIds.insert(target_bb->id);
+        }
+
+        // Unconditional jump instructions have a unique successor
+        if (!last->unconditionalJumpInstruction()) {
+            BasicBlock* next_bb = basicBlock(last->instNum() + 1);
+            bb->successorIds.insert(next_bb->id);
+        }
+    }
+}
+
+
+// In-place set intersection
+static void
+intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
+{
+    std::set<uint32_t>::iterator it = a.begin();
+    while (it != a.end()) {
+        it = b.find(*it) != b.end() ?
++it : a.erase(it);
+    }
+}
+
+
+// Iterative data-flow computation of the post-dominator set of each block.
+void
+ControlFlowInfo::findPostDominators()
+{
+    // the only postdominator of the exit block is itself
+    basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
+    //copy all basic blocks to all postdominator lists except for exit block
+    for (auto& block : basicBlocks) {
+        if (!block->isExit()) {
+            for (uint32_t i = 0; i < basicBlocks.size(); i++) {
+                block->postDominatorIds.insert(i);
+            }
+        }
+    }
+
+    // shrink the sets until a full pass changes nothing
+    bool change = true;
+    while (change) {
+        change = false;
+        for (int h = basicBlocks.size() - 2; h >= 0; --h) {
+            size_t num_postdominators =
+                    basicBlocks[h]->postDominatorIds.size();
+            for (int s : basicBlocks[h]->successorIds) {
+                intersect(basicBlocks[h]->postDominatorIds,
+                          basicBlocks[s]->postDominatorIds);
+            }
+            // a block always post-dominates itself
+            basicBlocks[h]->postDominatorIds.insert(h);
+            change |= (num_postdominators
+                    != basicBlocks[h]->postDominatorIds.size());
+        }
+    }
+}
+
+
+// In-place set difference
+static void
+setDifference(std::set<uint32_t>&a,
+           const std::set<uint32_t>& b, uint32_t exception)
+{
+    for (uint32_t b_elem : b) {
+        if (b_elem != exception) {
+            a.erase(b_elem);
+        }
+    }
+}
+
+// Pick, for every non-exit block, the single post-dominator that is not
+// itself post-dominated by another candidate, and record the instruction
+// number of its first instruction on the block's last instruction.
+void
+ControlFlowInfo::findImmediatePostDominators()
+{
+    assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
+
+    findPostDominators();
+
+    for (auto& basicBlock : basicBlocks) {
+        if (basicBlock->isExit()) {
+            continue;
+        }
+        std::set<uint32_t> candidates = basicBlock->postDominatorIds;
+        candidates.erase(basicBlock->id);
+        for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
+            if (postDominatorId != basicBlock->id) {
+                setDifference(candidates,
+                           basicBlocks[postDominatorId]->postDominatorIds,
+                           postDominatorId);
+            }
+        }
+        assert(candidates.size() == 1);
+        GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
+        BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
+        if (!ipd_block->isExit()) {
+            GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
+            last_instruction->ipdInstNum(ipd_first_inst->instNum());
+        } else {
+            last_instruction->ipdInstNum(last_instruction->instNum() + 1);
+        }
+    }
+}
+
+// Debug helper: print the post-dominator set of every block.
+void
+ControlFlowInfo::printPostDominators() const
+{
+    for (auto& block : basicBlocks) {
+        std::cout << "PD(" << block->id << ") = {";
+        std::copy(block->postDominatorIds.begin(),
+                  block->postDominatorIds.end(),
+                  std::ostream_iterator<uint32_t>(std::cout, ", "));
+        std::cout << "}" << std::endl;
+    }
+}
+
+// Debug helper: print the immediate post-dominator of every non-exit block.
+void
+ControlFlowInfo::printImmediatePostDominators() const
+{
+    for (const auto& block : basicBlocks) {
+        if (block->isExit()) {
+            continue;
+        }
+        std::cout << "IPD(" << block->id << ") = ";
+        std::cout << postDominator(block.get())->id << ", ";
+    }
+    std::cout << std::endl;
+}
+// Debug helper: print each instruction with the id of its basic block.
+void
+ControlFlowInfo::printBasicBlocks() const
+{
+    for (GPUStaticInst* inst : instructions) {
+        int inst_num = inst->instNum();
+        std::cout << inst_num << " [" << basicBlock(inst_num)->id
+                << "]: " << inst->disassemble();
+        if (inst->o_type == Enums::OT_BRANCH) {
+            std::cout << ", PC = " << inst->getTargetPc();
+        }
+        std::cout << std::endl;
+    }
+}
+
+// Debug helper: print the successor edges as a Graphviz digraph.
+void
+ControlFlowInfo::printBasicBlockDot() const
+{
+    printf("digraph {\n");
+    for (const auto& basic_block : basicBlocks) {
+        printf("\t");
+        for (uint32_t successorId : basic_block->successorIds) {
+            printf("%d -> %d; ", basic_block->id, successorId);
+        }
+        printf("\n");
+    }
+    printf("}\n");
+}
diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh
new file mode 100644
index 000000000..74ea861d8
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.hh
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __KERNEL_CFG_HH__ +#define __KERNEL_CFG_HH__ + +#include +#include +#include +#include +#include + + +class GPUStaticInst; +class HsailCode; + +struct BasicBlock +{ + BasicBlock(uint32_t num, GPUStaticInst* begin) : + id(num), size(0), firstInstruction(begin) + { + } + + bool + isEntry() const + { + return !id; + } + + bool + isExit() const + { + return !size; + } + + /** + * Unique identifier for the block within a given kernel. + */ + const uint32_t id; + + /** + * Number of instructions contained in the block + */ + size_t size; + + /** + * Pointer to first instruction of the block. + */ + GPUStaticInst* firstInstruction; + + /** + * Identifiers of the blocks that follow (are reachable from) this block. 
+     */
+    std::set<uint32_t> successorIds;
+
+    /**
+     * Identifiers of the blocks that post-dominate this block, i.e. the
+     * blocks every path from this block to the exit passes through
+     * (the block itself included).
+     */
+    std::set<uint32_t> postDominatorIds;
+};
+
+class ControlFlowInfo
+{
+public:
+
+    /**
+     * Compute immediate post-dominator instruction for kernel instructions.
+     */
+    static void assignImmediatePostDominators(
+            const std::vector<GPUStaticInst*>& instructions);
+
+private:
+    ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
+
+    GPUStaticInst* lastInstruction(const BasicBlock* block) const;
+
+    BasicBlock* basicBlock(int inst_num) const;
+
+    BasicBlock* postDominator(const BasicBlock* block) const;
+
+    void createBasicBlocks();
+
+    void connectBasicBlocks();
+
+    void findPostDominators();
+
+    void findImmediatePostDominators();
+
+    void printBasicBlocks() const;
+
+    void printBasicBlockDot() const;
+
+    void printPostDominators() const;
+
+    void printImmediatePostDominators() const;
+
+    // blocks in program order; the last entry is the empty exit block
+    std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
+    std::vector<GPUStaticInst*> instructions;
+};
+
+#endif // __KERNEL_CFG_HH__
diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc
new file mode 100644
index 000000000..91ee8009a
--- /dev/null
+++ b/src/gpu-compute/lds_state.cc
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3.
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include +#include +#include + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new 
LdsState(this); +} + +/** + * set the parent and name based on the parent + */ +void +LdsState::setParent(ComputeUnit *x_parent) +{ + // check that this gets assigned to the same thing each time + fatal_if(!x_parent, "x_parent should not be nullptr"); + fatal_if(x_parent == parent, + "should not be setting the parent twice"); + + parent = x_parent; + _name = x_parent->name() + ".LdsState"; +} + +/** + * derive the gpu mem packet from the packet and then count the bank conflicts + */ +unsigned +LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) +{ + Packet::SenderState *baseSenderState = packet->senderState; + while (baseSenderState->predecessor) { + baseSenderState = baseSenderState->predecessor; + } + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast(baseSenderState); + + fatal_if(!senderState, + "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + return countBankConflicts(gpuDynInst, bankAccesses); +} + +// Count the total number of bank conflicts for the local memory packet +unsigned +LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses) +{ + int bank_conflicts = 0; + std::vector bank; + // the number of LDS banks being touched by the memory instruction + int numBanks = std::min(parent->wfSize(), banks); + // if the wavefront size is larger than the number of LDS banks, we + // need to iterate over all work items to calculate the total + // number of bank conflicts + int groups = (parent->wfSize() > numBanks) ? 
+ (parent->wfSize() / numBanks) : 1; + for (int i = 0; i < groups; i++) { + // Address Array holding all the work item addresses of an instruction + std::vector addr_array; + addr_array.resize(numBanks, 0); + bank.clear(); + bank.resize(banks, 0); + int max_bank = 0; + + // populate the address array for all active work items + for (int j = 0; j < numBanks; j++) { + if (gpuDynInst->exec_mask[(i*numBanks)+j]) { + addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; + } else { + addr_array[j] = std::numeric_limits::max(); + } + } + + if (gpuDynInst->m_op == Enums::MO_LD || + gpuDynInst->m_op == Enums::MO_ST) { + // mask identical addresses + for (int j = 0; j < numBanks; ++j) { + for (int j0 = 0; j0 < j; j0++) { + if (addr_array[j] != std::numeric_limits::max() + && addr_array[j] == addr_array[j0]) { + addr_array[j] = std::numeric_limits::max(); + } + } + } + } + // calculate bank conflicts + for (int j = 0; j < numBanks; ++j) { + if (addr_array[j] != std::numeric_limits::max()) { + int bankId = addr_array[j] % banks; + bank[bankId]++; + max_bank = std::max(max_bank, bank[bankId]); + // Count the number of LDS banks accessed. + // Since we have masked identical addresses all remaining + // accesses will need to be serialized if they access + // the same bank (bank conflict). 
+                (*numBankAccesses)++;
+            }
+        }
+        bank_conflicts += max_bank;
+    }
+    panic_if(bank_conflicts > parent->wfSize(),
+             "Max bank conflicts should match num of work items per instr");
+    return bank_conflicts;
+}
+
+/**
+ * receive the packet from the CU
+ */
+bool
+LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
+{
+    return ownerLds->processPacket(packet);
+}
+
+/**
+ * recover the GPU dynamic instruction carried in the packet's sender state
+ */
+GPUDynInstPtr
+LdsState::getDynInstr(PacketPtr packet)
+{
+    ComputeUnit::LDSPort::SenderState *ss =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+                     packet->senderState);
+    // a failed cast yields nullptr; stop instead of dereferencing it
+    fatal_if(!ss, "did not get the right sort of sender state");
+    return ss->getMemInst();
+}
+
+/**
+ * process an incoming packet, add it to the return queue
+ */
+bool
+LdsState::processPacket(PacketPtr packet)
+{
+    unsigned bankAccesses = 0;
+    // the number of conflicts this packet will have when accessing the LDS
+    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
+    // count the total number of physical LDS bank accessed
+    parent->ldsBankAccesses += bankAccesses;
+    // count the LDS bank conflicts. A number set to 1 indicates one
+    // access per bank maximum so there are no bank conflicts
+    parent->ldsBankConflictDist.sample(bankConflicts-1);
+
+    GPUDynInstPtr dynInst = getDynInstr(packet);
+    // account for the LDS bank conflict overhead
+    int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
+        (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
+        parent->loadBusLength();
+    // delay for accessing the LDS
+    Tick processingTime =
+        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
+        parent->shader->ticks(busLength);
+    // choose (delay + last packet in queue) or (now + delay) as the time to
+    // return this
+    Tick doneAt = earliestReturnTime() + processingTime;
+    // then store it for processing
+    return returnQueuePush(std::make_pair(doneAt, packet));
+}
+
+/**
+ * add this to the queue of packets to be returned
+ */
+bool
+LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
+{
+    // TODO add time limits (e.g.
one packet per cycle) and queue size limits
+    // and implement flow control
+    returnQueue.push(thePair);
+
+    // if there is no set wakeup time, look through the queue
+    if (!tickEvent.scheduled()) {
+        process();
+    }
+
+    return true;
+}
+
+/**
+ * receive a packet in functional mode
+ */
+void
+LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
+{
+    fatal("not implemented");
+}
+
+/**
+ * receive a retry for a response
+ */
+void
+LdsState::CuSidePort::recvRespRetry()
+{
+    // TODO verify that this is the right way to do this
+    assert(ownerLds->isRetryResp());
+    ownerLds->setRetryResp(false);
+    ownerLds->process();
+}
+
+/**
+ * receive a retry
+ */
+void
+LdsState::CuSidePort::recvRetry()
+{
+    fatal("not implemented");
+}
+
+/**
+ * look for packets to return at this time
+ */
+bool
+LdsState::process()
+{
+    Tick now = clockEdge();
+
+    // send back completed packets
+    while (!returnQueue.empty() && returnQueue.front().first <= now) {
+        PacketPtr packet = returnQueue.front().second;
+
+        ComputeUnit::LDSPort::SenderState *ss =
+            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+                            packet->senderState);
+
+        // a failed cast yields nullptr; stop instead of dereferencing it
+        fatal_if(!ss, "did not get the right sort of sender state");
+
+        GPUDynInstPtr gpuDynInst = ss->getMemInst();
+
+        gpuDynInst->initiateAcc(gpuDynInst);
+
+        packet->makeTimingResponse();
+
+        returnQueue.pop();
+
+        bool success = cuPort.sendTimingResp(packet);
+
+        if (!success) {
+            retryResp = true;
+            panic("have not handled timing responses being NACK'd when sent "
+                  "back");
+        }
+    }
+
+    // determine the next wakeup time
+    if (!returnQueue.empty()) {
+
+        Tick next = returnQueue.front().first;
+
+        if (tickEvent.scheduled()) {
+
+            // only move the pending wakeup when the new one is earlier
+            if (next < tickEvent.when()) {
+
+                tickEvent.deschedule();
+                tickEvent.schedule(next);
+            }
+        } else {
+            tickEvent.schedule(next);
+        }
+    }
+
+    return true;
+}
+
+/**
+ * wake up at this time and perform specified actions
+ */
+void
+LdsState::TickEvent::process()
+{
+    ldsState->process();
+}
+
+/**
+ * no statistics are registered by the LDS itself
+ */
+void
+LdsState::regStats()
+{
+}
diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
new file mode 100644
index
000000000..89f08a1d3 --- /dev/null +++ b/src/gpu-compute/lds_state.hh @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#ifndef __LDS_STATE_HH__
+#define __LDS_STATE_HH__
+
+#include <array>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/misc.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "params/LdsState.hh"
+
+class ComputeUnit;
+
+/**
+ * this represents a slice of the overall LDS, intended to be associated with an
+ * individual workgroup
+ */
+class LdsChunk
+{
+  public:
+    LdsChunk(const uint32_t x_size):
+        chunk(x_size)
+    {
+    }
+
+    LdsChunk() {}
+
+    /**
+     * a read operation: return the T stored at byte offset index
+     */
+    template<class T>
+    T
+    read(const uint32_t index)
+    {
+        fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
+        // the whole object must fit, not just its first byte
+        fatal_if((uint64_t)index + sizeof(T) > chunk.size(),
+                 "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        return *p0;
+    }
+
+    /**
+     * a write operation: store value as a T at byte offset index
+     */
+    template<class T>
+    void
+    write(const uint32_t index, const T value)
+    {
+        fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
+        // the whole object must fit, not just its first byte
+        fatal_if((uint64_t)index + sizeof(T) > chunk.size(),
+                 "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        *p0 = value;
+    }
+
+    /**
+     * get the size of this chunk
+     */
+    std::vector<uint8_t>::size_type
+    size() const
+    {
+        return chunk.size();
+    }
+
+  protected:
+    // the actual data store for this slice of the LDS
+    std::vector<uint8_t> chunk;
+};
+
+// Local Data Share (LDS) State per Wavefront (contents of the LDS region
+// allocated to the WorkGroup of this Wavefront)
+class LdsState: public MemObject
+{
+  protected:
+
+    /**
+     * an event to allow event-driven execution
+     */
+    class TickEvent: public Event
+    {
+      protected:
+
+        LdsState *ldsState = nullptr;
+
+        Tick nextTick = 0;
+
+      public:
+
+        TickEvent(LdsState *_ldsState) :
+            ldsState(_ldsState)
+        {
+        }
+
+        virtual void
+        process();
+
+        void
+        schedule(Tick when)
+        {
+            mainEventQueue[0]->schedule(this, when);
+        }
+
+        void
+        deschedule()
+        {
+            mainEventQueue[0]->deschedule(this);
+        }
+    };
+
+
/** + * CuSidePort is the LDS Port closer to the CU side + */ + class CuSidePort: public SlavePort + { + public: + CuSidePort(const std::string &_name, LdsState *_ownerLds) : + SlavePort(_name, _ownerLds), ownerLds(_ownerLds) + { + } + + protected: + LdsState *ownerLds; + + virtual bool + recvTimingReq(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) + { + return 0; + } + + virtual void + recvFunctional(PacketPtr pkt); + + virtual void + recvRangeChange() + { + } + + virtual void + recvRetry(); + + virtual void + recvRespRetry(); + + virtual AddrRangeList + getAddrRanges() const + { + AddrRangeList ranges; + ranges.push_back(ownerLds->getAddrRange()); + return ranges; + } + + template + void + loadData(PacketPtr packet); + + template + void + storeData(PacketPtr packet); + + template + void + atomicOperation(PacketPtr packet); + }; + + protected: + + // the lds reference counter + // The key is the workgroup ID and dispatch ID + // The value is the number of wavefronts that reference this LDS, as + // wavefronts are launched, the counter goes up for that workgroup and when + // they return it decreases, once it reaches 0 then this chunk of the LDS is + // returned to the available pool. 
However,it is deallocated on the 1->0 + // transition, not whenever the counter is 0 as it always starts with 0 when + // the workgroup asks for space + std::unordered_map> refCounter; + + // the map that allows workgroups to access their own chunk of the LDS + std::unordered_map> chunkMap; + + // an event to allow the LDS to wake up at a specified time + TickEvent tickEvent; + + // the queue of packets that are going back to the CU after a + // read/write/atomic op + // TODO need to make this have a maximum size to create flow control + std::queue> returnQueue; + + // whether or not there are pending responses + bool retryResp = false; + + bool + process(); + + GPUDynInstPtr + getDynInstr(PacketPtr packet); + + bool + processPacket(PacketPtr packet); + + unsigned + countBankConflicts(PacketPtr packet, unsigned *bankAccesses); + + unsigned + countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses); + + public: + typedef LdsStateParams Params; + + LdsState(const Params *params); + + // prevent copy construction + LdsState(const LdsState&) = delete; + + ~LdsState() + { + parent = nullptr; + } + + const Params * + params() const + { + return dynamic_cast(_params); + } + + bool + isRetryResp() const + { + return retryResp; + } + + void + setRetryResp(const bool value) + { + retryResp = value; + } + + // prevent assignment + LdsState & + operator=(const LdsState &) = delete; + + /** + * use the dynamic wave id to create or just increase the reference count + */ + int + increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + fatal_if(refCount < 0, + "reference count should not be below zero"); + return ++refCounter[dispatchId][wgId]; + } + + /** + * decrease the reference count after making sure it is in the list + * give back this chunk if the ref counter has reached 0 + */ + int + decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = 
getRefCounter(dispatchId, wgId); + + fatal_if(refCount <= 0, + "reference count should not be below zero or at zero to" + "decrement"); + + refCounter[dispatchId][wgId]--; + + if (refCounter[dispatchId][wgId] == 0) { + releaseSpace(dispatchId, wgId); + return 0; + } else { + return refCounter[dispatchId][wgId]; + } + } + + /** + * return the current reference count for this workgroup id + */ + int + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const + { + auto dispatchIter = chunkMap.find(dispatchId); + fatal_if(dispatchIter == chunkMap.end(), + "could not locate this dispatch id [%d]", dispatchId); + + auto workgroup = dispatchIter->second.find(wgId); + fatal_if(workgroup == dispatchIter->second.end(), + "could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + + auto refCountIter = refCounter.find(dispatchId); + if (refCountIter == refCounter.end()) { + fatal("could not locate this dispatch id [%d]", dispatchId); + } else { + auto workgroup = refCountIter->second.find(wgId); + if (workgroup == refCountIter->second.end()) { + fatal("could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + } else { + return refCounter.at(dispatchId).at(wgId); + } + } + + fatal("should not reach this point"); + return 0; + } + + /** + * assign a parent and request this amount of space be set aside + * for this wgid + */ + LdsChunk * + reserveSpace(const uint32_t dispatchId, const uint32_t wgId, + const uint32_t size) + { + if (chunkMap.find(dispatchId) != chunkMap.end()) { + fatal_if( + chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), + "duplicate workgroup ID asking for space in the LDS " + "did[%d] wgid[%d]", dispatchId, wgId); + } + + fatal_if(bytesAllocated + size > maximumSize, + "request would ask for more space than is available"); + + bytesAllocated += size; + + chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + // make an entry for this workgroup + 
refCounter[dispatchId][wgId] = 0; + + return &chunkMap[dispatchId][wgId]; + } + + bool + returnQueuePush(std::pair thePair); + + Tick + earliestReturnTime() const + { + // TODO set to max(lastCommand+1, curTick()) + return returnQueue.empty() ? curTick() : returnQueue.back().first; + } + + void + setParent(ComputeUnit *x_parent); + + void + regStats(); + + // accessors + ComputeUnit * + getParent() const + { + return parent; + } + + std::string + getName() + { + return _name; + } + + int + getBanks() const + { + return banks; + } + + ComputeUnit * + getComputeUnit() const + { + return parent; + } + + int + getBankConflictPenalty() const + { + return bankConflictPenalty; + } + + /** + * get the allocated size for this workgroup + */ + std::size_t + ldsSize(const uint32_t x_wgId) + { + return chunkMap[x_wgId].size(); + } + + AddrRange + getAddrRange() const + { + return range; + } + + virtual BaseSlavePort & + getSlavePort(const std::string& if_name, PortID idx) + { + if (if_name == "cuPort") { + // TODO need to set name dynamically at this point? + return cuPort; + } else { + fatal("cannot resolve the port name " + if_name); + } + } + + /** + * can this much space be reserved for a workgroup? 
+     */
+    bool
+    canReserve(uint32_t x_size) const
+    {
+        // true when the request fits in what is left of maximumSize
+        return bytesAllocated + x_size <= maximumSize;
+    }
+
+  private:
+    /**
+     * give back the space
+     *
+     * Calls fatal if the dispatch ID or the workgroup ID is unknown, or if
+     * the chunk is larger than bytesAllocated. Otherwise subtracts the chunk
+     * size from bytesAllocated, erases the workgroup's chunkMap entry and
+     * returns true. Only chunkMap is modified here; the refCounter entry is
+     * left in place.
+     */
+    bool
+    releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
+    {
+        auto dispatchIter = chunkMap.find(x_dispatchId);
+
+        if (dispatchIter == chunkMap.end()) {
+            fatal("dispatch id not found [%d]", x_dispatchId);
+        } else {
+            auto workgroupIter = dispatchIter->second.find(x_wgId);
+            if (workgroupIter == dispatchIter->second.end()) {
+                fatal("workgroup id [%d] not found in dispatch id [%d]",
+                      x_wgId, x_dispatchId);
+            }
+        }
+
+        fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
+                 "releasing more space than was allocated");
+
+        bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
+        chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
+        return true;
+    }
+
+    // the port that connects this LDS to its owner CU
+    CuSidePort cuPort;
+
+    // the compute unit returned by getParent() and getComputeUnit()
+    ComputeUnit* parent = nullptr;
+
+    // the name returned by getName()
+    std::string _name;
+
+    // the number of bytes currently reserved by all workgroups
+    int bytesAllocated = 0;
+
+    // the size of the LDS, the most bytes available
+    // (no default initializer here; presumably set by the constructor)
+    int maximumSize;
+
+    // Address range of this memory
+    AddrRange range;
+
+    // the penalty, in cycles, for each LDS bank conflict
+    int bankConflictPenalty = 0;
+
+    // the number of banks in the LDS underlying data store
+    int banks = 0;
+};
+
+#endif // __LDS_STATE_HH__
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
new file mode 100644
index 000000000..7f919c5f4
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/local_memory_pipeline.hh" + +#include "debug/GPUPort.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size) +{ +} + +void +LocalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".LocalMemPipeline"; +} + +void +LocalMemPipeline::exec() +{ + // apply any returned shared (LDS) memory operations + GPUDynInstPtr m = !lmReturnedRequests.empty() ? + lmReturnedRequests.front() : nullptr; + + bool accessVrf = true; + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && + computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return + || computeUnit->wfWait.at(m->pipeId).rdy())) { + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doSmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doSmReturn(m); + else if 
(m->v_type == VT_64 && m->m_type == Enums::M_U32) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doSmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doSmReturn(m); + } + + // If pipeline has executed a local memory instruction + // execute local memory packet and issue the packets + // to LDS + if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) { + + GPUDynInstPtr m = lmIssuedRequests.front(); + + bool returnVal = computeUnit->sendToLds(m); + if (!returnVal) { + DPRINTF(GPUPort, "packet was nack'd and put in retry queue"); + } + lmIssuedRequests.pop(); + } +} + +template +void +LocalMemPipeline::doSmReturn(GPUDynInstPtr m) +{ + lmReturnedRequests.pop(); + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector regVec; + for (int k = 0; k < m->n_reg; ++k) { + int dst = m->dst_reg+k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst,sizeof(c0),1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + // write the value into the physical VGPR. This is a purely + // functional operation. No timing is modeled. 
+ w->computeUnit->vrf[w->simdId]->write(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. This simply + // models the timing aspect of the VRF write operation. It does not + // modify the physical VGPR. + loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, + regVec, sizeof(c0), m->time); + } + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) + || MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm, + m->time, -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +LocalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles LDS data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh new file mode 100644 index 000000000..a63d867d0 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __LOCAL_MEMORY_PIPELINE_HH__ +#define __LOCAL_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file local_memory_pipeline.hh + * + * The local memory pipeline issues newly created local memory packets + * from pipeline to the LDS. This stage also retires previously issued + * loads and stores that have returned from the LDS. 
+ */
+
+class ComputeUnit;
+class Wavefront;
+
+// Holds the two local-memory FIFOs (issued requests and returned responses)
+// and the per-step logic that moves requests between them and the LDS.
+class LocalMemPipeline
+{
+  public:
+    LocalMemPipeline(const ComputeUnitParams *params);
+    // attach the pipeline to its compute unit; defined in the .cc file
+    void init(ComputeUnit *cu);
+    // run one step of the pipeline; defined in the .cc file
+    void exec();
+
+    // NOTE(review): the template parameter list is missing in this copy of
+    // the patch.
+    template void doSmReturn(GPUDynInstPtr m);
+
+    // direct, mutable access to the request and response FIFOs
+    std::queue &getLMReqFIFO() { return lmIssuedRequests; }
+    std::queue &getLMRespFIFO() { return lmReturnedRequests; }
+
+    // true while the response FIFO holds fewer than lmQueueSize entries
+    bool
+    isLMRespFIFOWrRdy() const
+    {
+        return lmReturnedRequests.size() < lmQueueSize;
+    }
+
+    // true while the request FIFO, plus pendReqs additional requests the
+    // caller accounts for, stays below lmQueueSize
+    bool
+    isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
+    {
+        return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
+    }
+
+    const std::string& name() const { return _name; }
+    void regStats();
+
+  private:
+    // owning compute unit; nullptr until init() is called
+    ComputeUnit *computeUnit;
+    std::string _name;
+    // capacity both FIFOs are checked against
+    // (signed, although it is compared with unsigned queue sizes above)
+    int lmQueueSize;
+    // cycles LDS data are delayed before updating the VRF (see regStats())
+    Stats::Scalar loadVrfBankConflictCycles;
+    // Local Memory Request Fifo: all shared memory requests
+    // are issued to this FIFO from the memory pipelines
+    std::queue lmIssuedRequests;
+
+    // Local Memory Response Fifo: all responses of shared memory
+    // requests are sent to this FIFO from LDS
+    std::queue lmReturnedRequests;
+};
+
+#endif // __LOCAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
new file mode 100644
index 000000000..4f8032832
--- /dev/null
+++ b/src/gpu-compute/misc.hh
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3.
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __MISC_HH__ +#define __MISC_HH__ + +#include +#include + +#include "base/misc.hh" + +class GPUDynInst; + +// wavefront size of the machine +static const int VSZ = 64; + +/* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). There are + a few places in the code where to_ullong() is used, however if VSZ is larger + than a value the host can support then bitset will throw a runtime exception. + + We should remove all use of to_ulong() or to_ullong() so we can have VSZ + greater than 64b, however until that is done this assert is required.
+ */
+static_assert(VSZ <= sizeof(unsigned long long) * 8,
+              "VSZ is larger than the host can support");
+
+// NOTE(review): the template arguments of both typedefs are missing in this
+// copy of the patch.
+typedef std::bitset VectorMask;
+typedef std::shared_ptr GPUDynInstPtr;
+
+// Tracks when a resource next becomes available, measured against an
+// external counter that is supplied through init().
+class WaitClass
+{
+  public:
+    // NOTE(review): numStages is not initialized here and tcnt starts out
+    // null, so init() must be called before any other member is used.
+    WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
+    // bind the counter this resource is timed against
+    void init(uint64_t *_tcnt, uint32_t _numStages=0)
+    {
+        tcnt = _tcnt;
+        numStages = _numStages;
+    }
+
+    // mark the resource busy for the next i counts; fatal if it is still
+    // busy from an earlier set()
+    void set(uint32_t i)
+    {
+        fatal_if(nxtAvail > *tcnt,
+            "Can't allocate resource because it is busy!!!");
+        nxtAvail = *tcnt + i;
+    }
+    // record a pending use of the resource; the look-ahead time only ever
+    // moves forward
+    void preset(uint32_t delay)
+    {
+        lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
+    }
+    // true once the counter has reached the time recorded by set()
+    bool rdy() const { return *tcnt >= nxtAvail; }
+    // true once the counter has reached the time recorded by preset()
+    bool prerdy() const { return *tcnt >= lookAheadAvail; }
+
+  private:
+    // timestamp indicating when resource will be available
+    uint64_t nxtAvail;
+    // timestamp indicating when resource will be available including
+    // pending uses of the resource (when there is a cycle gap between
+    // rdy() and set())
+    uint64_t lookAheadAvail;
+    // current timestamp
+    uint64_t *tcnt;
+    // number of stages between checking if a resource is ready and
+    // setting the resource's utilization
+    uint32_t numStages;
+};
+
+// A 16-bit floating point value stored as raw bits in `val`: 1 sign bit,
+// 5 exponent bits and 10 mantissa bits, as laid out by the shifts below.
+// Conversions to and from float work on the bit patterns directly.
+class Float16
+{
+  public:
+    uint16_t val;
+
+    Float16() { val = 0; }
+
+    Float16(const Float16 &x) : val(x.val) { }
+
+    // Convert from a 32-bit float. The low 13 mantissa bits are dropped
+    // without rounding.
+    // NOTE(review): the bits are read through a pointer cast, which is not
+    // aliasing-safe in standard C++ -- consider memcpy.
+    Float16(float x)
+    {
+        uint32_t ai = *(uint32_t *)&x;
+
+        uint32_t s = (ai >> 31) & 0x1;
+        uint32_t exp = (ai >> 23) & 0xff;
+        uint32_t mant = (ai >> 0) & 0x7fffff;
+
+        if (exp == 0 || exp <= 0x70) {
+            // biased exponent of 0x70 or less: the result becomes zero
+            exp = 0;
+            mant = 0;
+        } else if (exp == 0xff) {
+            // all-ones exponent stays all-ones; the mantissa is kept
+            exp = 0x1f;
+        } else if (exp >= 0x8f) {
+            // biased exponent of 0x8f or more: all-ones exponent, zero
+            // mantissa
+            exp = 0x1f;
+            mant = 0;
+        } else {
+            // rebias the exponent from 0x7f to 0x0f
+            exp = exp - 0x7f + 0x0f;
+        }
+
+        mant = mant >> 13;
+
+        val = 0;
+        val |= (s << 15);
+        val |= (exp << 10);
+        val |= (mant << 0);
+    }
+
+    // Convert to a 32-bit float. A zero exponent field always yields zero,
+    // whatever the mantissa holds.
+    // NOTE(review): same pointer-cast concern as in the constructor above.
+    operator float() const
+    {
+        uint32_t s = (val >> 15) & 0x1;
+        uint32_t exp = (val >> 10) & 0x1f;
+        uint32_t mant = (val >> 0) & 0x3ff;
+
+        if (!exp) {
+            exp = 0;
+            mant = 0;
+        } else if (exp == 0x1f) {
+            exp = 0xff;
+        } else {
+            // rebias the exponent from 0x0f to 0x7f
+            exp = exp - 0x0f + 0x7f;
+        }
+
+        uint32_t val1 = 0;
+        val1 |= (s << 31);
+        val1 |= (exp << 23);
+        val1 |= (mant << 13);
+
+        return *(float*)&val1;
+    }
+};
+
+#endif // __MISC_HH__
diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh
new file mode 100644
index 000000000..d1ad35d4b
--- /dev/null
+++ b/src/gpu-compute/ndrange.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __NDRANGE_HH__
+#define __NDRANGE_HH__
+
+#include "base/types.hh"
+#include "gpu-compute/qstruct.hh"
+
+// Bookkeeping for one dispatched kernel: a copy of its queue entry plus
+// counters that track workgroup launch and completion. This is a plain
+// aggregate with no constructor, so whoever creates one must initialize
+// every field.
+struct NDRange
+{
+    // copy of the queue entry provided at dispatch
+    HsaQueueEntry q;
+
+    // The current workgroup id (3 dimensions)
+    int wgId[3];
+    // The number of workgroups in each dimension
+    int numWg[3];
+    // The total number of workgroups
+    int numWgTotal;
+
+    // The number of completed work groups
+    int numWgCompleted;
+    // The global workgroup ID
+    uint32_t globalWgId;
+
+    // flag indicating whether all work groups have been launched
+    bool wg_disp_rem;
+    // kernel complete
+    bool execDone;
+    bool userDoorBellSet;
+    // presumably locations written to signal progress or completion --
+    // TODO confirm against the dispatcher
+    volatile bool *addrToNotify;
+    volatile uint32_t *numDispLeft;
+    // ID of the dispatch this range belongs to
+    int dispatchId;
+    int curTid; // Current thread id
+};
+
+#endif // __NDRANGE_HH__
diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc
new file mode 100644
index 000000000..7f114706a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/of_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +OFSchedulingPolicy::chooseWave() +{ + // Set when policy choose a wave to schedule + bool waveChosen = false; + Wavefront *selectedWave = nullptr; + int selectedWaveID = -1; + uint32_t selectedPosition = 0; + + for (int position = 0; position < scheduleList->size(); ++position) { + Wavefront *curWave = scheduleList->at(position); + uint32_t curWaveID = curWave->wfDynId; + + // Choosed wave with the lowest wave ID + if (selectedWaveID == -1 || curWaveID < selectedWaveID) { + waveChosen = true; + selectedWaveID = curWaveID; + selectedWave = curWave; + selectedPosition = position; + } + } + + // Check to make sure ready list had atleast one schedulable wave + if (waveChosen) { + scheduleList->erase(scheduleList->begin() + selectedPosition); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +OFSchedulingPolicy::bindList(std::vector *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh new file mode 100644 index 000000000..684e51a3a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.hh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __OF_SCHEDULING_POLICY_HH__
+#define __OF_SCHEDULING_POLICY_HH__
+
+#include
+#include
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Oldest First where age is marked by the wave id
+//
+// Usage: bind a list of candidate waves with bindList(), then call
+// chooseWave() to obtain a wave from that list. Both are defined in the
+// matching .cc file.
+class OFSchedulingPolicy
+{
+  public:
+    // starts with no list bound; bindList() must be called before
+    // chooseWave()
+    OFSchedulingPolicy() : scheduleList(nullptr) { }
+
+    Wavefront* chooseWave();
+    // NOTE(review): the element type of the vector is missing in this copy
+    // of the patch.
+    void bindList(std::vector *list);
+
+  private:
+    // List of waves which are participating in scheduling.
+    // This scheduler selects the oldest wave from this list
+    std::vector *scheduleList;
+};
+
+#endif // __OF_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc
new file mode 100644
index 000000000..b1bc6b1f3
--- /dev/null
+++ b/src/gpu-compute/pool_manager.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/pool_manager.hh"
+
+// Record the minimum allocation size and the pool size. An empty pool is
+// rejected with assert(), so the check disappears when NDEBUG is defined.
+PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
+    : _minAllocation(minAlloc), _poolSize(poolSize)
+{
+    assert(poolSize > 0);
+}
diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh
new file mode 100644
index 000000000..2cb53ce72
--- /dev/null
+++ b/src/gpu-compute/pool_manager.hh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __POOL_MANAGER_HH__ +#define __POOL_MANAGER_HH__ + +#include +#include +#include + +// Pool Manager Logic +class PoolManager +{ + public: + PoolManager(uint32_t minAlloc, uint32_t poolSize); + uint32_t minAllocation() { return _minAllocation; } + virtual std::string printRegion() = 0; + virtual uint32_t regionSize(std::pair ®ion) = 0; + virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0; + + virtual uint32_t allocateRegion(const uint32_t size, + uint32_t *reserved) = 0; + + virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0; + uint32_t poolSize() { return _poolSize; } + + private: + // minimum size that can be reserved per allocation + uint32_t _minAllocation; + // pool size in number of elements + uint32_t _poolSize; +}; + +#endif // __POOL_MANAGER_HH__ diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh new file mode 100644 index 000000000..092303c00 --- /dev/null +++ b/src/gpu-compute/qstruct.hh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __Q_STRUCT_HH__ +#define __Q_STRUCT_HH__ + +#include +#include + +// Maximum number of arguments +static const int KER_NUM_ARGS = 32; +// Kernel argument buffer size +static const int KER_ARGS_LENGTH = 512; + +class LdsChunk; +struct NDRange; + +// Be very careful of alignment in this structure. The structure +// must compile to the same layout in both 32-bit and 64-bit mode. 
+struct HsaQueueEntry +{ + // Base pointer for array of instruction pointers + uint64_t code_ptr; + // Grid Size (3 dimensions) + uint32_t gdSize[3]; + // Workgroup Size (3 dimensions) + uint32_t wgSize[3]; + uint16_t sRegCount; + uint16_t dRegCount; + uint16_t cRegCount; + uint64_t privMemStart; + uint32_t privMemPerItem; + uint32_t privMemTotal; + uint64_t spillMemStart; + uint32_t spillMemPerItem; + uint32_t spillMemTotal; + uint64_t roMemStart; + uint32_t roMemTotal; + // Size (in bytes) of LDS + uint32_t ldsSize; + // Virtual Memory Id (unused right now) + uint32_t vmId; + + // Pointer to dependency chain (unused now) + uint64_t depends; + + // pointer to bool + uint64_t addrToNotify; + // pointer to uint32_t + uint64_t numDispLeft; + + // variables to pass arguments when running in standalone mode, + // will be removed when run.py and sh.cpp have been updated to + // use args and offset arrays + uint64_t arg1; + uint64_t arg2; + uint64_t arg3; + uint64_t arg4; + + // variables to pass arguments when running in cpu+gpu mode + uint8_t args[KER_ARGS_LENGTH]; + uint16_t offsets[KER_NUM_ARGS]; + uint16_t num_args; +}; + +// State used to start (or restart) a WF +struct WFContext +{ + // 32 bit values + // barrier state + int bar_cnt[VSZ]; + + // id (which WF in the WG) + int cnt; + + // more barrier state + int max_bar_cnt; + int old_barrier_cnt; + int barrier_cnt; + + // More Program Counter Stuff + uint32_t pc; + + // Program counter of the immediate post-dominator instruction + uint32_t rpc; + + // WG wide state (I don't see how to avoid redundancy here) + int cu_id; + uint32_t wg_id; + uint32_t barrier_id; + + // 64 bit values (these values depend on the wavefront size) + // masks + uint64_t init_mask; + uint64_t exec_mask; + + // private memory; + Addr privBase; + Addr spillBase; + + LdsChunk *ldsChunk; + + /* + * Kernel wide state + * This is a hack. This state should be moved through simulated memory + * during a yield. 
Though not much is being used here, so it's probably + * probably not a big deal. + * + * Just to add to this comment... The ndr is derived from simulated + * memory when the cl-runtime allocates an HsaQueueEntry and populates it + * for a kernel launch. So in theory the runtime should be able to keep + * that state around. Then a WF can reference it upon restart to derive + * kernel wide state. The runtime can deallocate the state when the + * kernel completes. + */ + NDRange *ndr; +}; + +// State that needs to be passed between the simulation and simulated app, a +// pointer to this struct can be passed through the depends field in the +// HsaQueueEntry struct +struct HostState +{ + // cl_event* has original HsaQueueEntry for init + uint64_t event; +}; + +// Total number of HSA queues +static const int HSAQ_NQUEUES = 8; + +// These values will eventually live in memory mapped registers +// and be settable by the kernel mode driver. + +// Number of entries in each HSA queue +static const int HSAQ_SIZE = 64; +// Address of first HSA queue index +static const int HSAQ_INDX_BASE = 0x10000ll; +// Address of first HSA queue +static const int HSAQ_BASE = 0x11000ll; +// Suggested start of HSA code +static const int HSA_CODE_BASE = 0x18000ll; + +// These are shortcuts for deriving the address of a specific +// HSA queue or queue index +#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n) +#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue)) +#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0)) +#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1)) +#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2)) + +/* + * Example code for writing to a queue + * + * void + * ToQueue(int n,struct fsaQueue *val) + * { + * int wi = *(int*)HSAQ_WI(n); + * int ri = *(int*)HSAQ_RI(n); + * int ci = *(int*)HSAQ_CI(n); + * + * if (ci - ri < HSAQ_SIZE) { + * (*(int*)HSAQ_CI(n))++; + * *(HsaQueueEntry*)(HSAQE(n, (wi % 
HSAQ_SIZE))) = *val; + * (*(int*)HSAQ_WI(n))++; + * } + * } + */ + +#endif // __Q_STRUCT_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc new file mode 100644 index 000000000..5d3591901 --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/rr_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +RRSchedulingPolicy::chooseWave() +{ + Wavefront *selectedWave = nullptr; + + // Check to make sure ready list had atleast one schedulable wave + if (scheduleList->size()) { + // For RR policy, select the wave which is at the + // front of the list. The selected wave is popped + // out from the schedule list immediately after selection + // to avoid starvation. It is the responsibility of the + // module invoking the RR scheduler to make surei scheduling + // eligible waves are added to the back of the schedule + // list + selectedWave = scheduleList->front(); + scheduleList->erase(scheduleList->begin() + 0); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +RRSchedulingPolicy::bindList(std::vector *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh new file mode 100644 index 000000000..780f294aa --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __RR_SCHEDULING_POLICY_HH__ +#define __RR_SCHEDULING_POLICY_HH__ + +#include + +#include +#include +#include + +#include "base/misc.hh" + +class Wavefront; + +// Round-Robin pick among the list of ready waves +class RRSchedulingPolicy +{ + public: + RRSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects one wave from this list based on + // round robin policy + std::vector *scheduleList; +}; + +#endif // __RR_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc new file mode 100644 index 000000000..068136026 --- /dev/null +++ b/src/gpu-compute/schedule_stage.cc @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/schedule_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+// Create one Scheduler per execution resource: numSIMDs vector units
+// followed by numMemUnits (global + shared) memory pipelines.
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
+    : numSIMDs(p->num_SIMDs),
+      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+{
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        Scheduler newScheduler(p);
+        scheduler.push_back(newScheduler);
+    }
+}
+
+// Drop the schedulers and the cached wave-status list pointers.
+ScheduleStage::~ScheduleStage()
+{
+    scheduler.clear();
+    waveStatusList.clear();
+}
+
+// Attach this stage to its compute unit: bind each scheduler to the
+// matching ready list and cache pointers to the compute unit's
+// wave-status lists (one per SIMD) and dispatch list.
+void
+ScheduleStage::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScheduleStage";
+
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        scheduler[j].bindList(&computeUnit->readyList[j]);
+    }
+
+    for (int j = 0; j < numSIMDs; ++j) {
+        waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+    }
+
+    dispatchList = &computeUnit->dispatchList;
+}
+
+// Resolve VRF read-port conflicts between a wave dispatched to a memory
+// pipeline and waves dispatched to the other execution resources that
+// map to the same SIMD unit. On a conflict the other wave is removed
+// from the dispatch list; the memory wave is kept.
+void
+ScheduleStage::arbitrate()
+{
+    // iterate over all Memory pipelines
+    for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
+        if (dispatchList->at(j).first) {
+            Wavefront *waveToMemPipe = dispatchList->at(j).first;
+            // iterate over all execution pipelines
+            for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
+                if ((i != j) && (dispatchList->at(i).first)) {
+                    Wavefront *waveToExePipe = dispatchList->at(i).first;
+                    // if the two selected wavefronts are mapped to the same
+                    // SIMD unit then they share the VRF
+                    if (waveToMemPipe->simdId == waveToExePipe->simdId) {
+                        int simdId = waveToMemPipe->simdId;
+                        // Read VRF port arbitration:
+                        // If there are read VRF port conflicts between
+                        // a memory instruction and another instruction,
+                        // we drop the other instruction.
+                        // We don't need to check for write VRF
+                        // port conflicts because the memory instruction either
+                        // does not need to write to the VRF (store) or will
+                        // write to the VRF when the data comes back (load) in
+                        // which case the arbiter of the memory pipes will
+                        // resolve any conflicts
+                        if (computeUnit->vrf[simdId]->
+                            isReadConflict(waveToMemPipe->wfSlotId,
+                            waveToExePipe->wfSlotId)) {
+                            // FIXME: The "second" member variable is never
+                            // used in the model. I am setting it to READY
+                            // simply to follow the protocol of setting it
+                            // when the WF has an instruction ready to issue
+                            waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
+                                                    .second = READY;
+
+                            dispatchList->at(i).first = nullptr;
+                            dispatchList->at(i).second = EMPTY;
+                            // stop scanning once one conflicting wave
+                            // has been dropped for this memory pipeline
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// For every execution resource with a non-empty ready list, let its
+// scheduler pick one wave, place it in the dispatch list and mark the
+// wave BLOCKED; then arbitrate shared resources among the picks.
+void
+ScheduleStage::exec()
+{
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        uint32_t readyListSize = computeUnit->readyList[j].size();
+
+        // If no wave is ready to be scheduled on the execution resource
+        // then skip scheduling for this execution resource
+        if (!readyListSize) {
+            continue;
+        }
+
+        Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
+        dispatchList->at(j).first = waveToBeDispatched;
+        waveToBeDispatched->updateResources();
+        dispatchList->at(j).second = FILLED;
+
+        waveStatusList[waveToBeDispatched->simdId]->at(
+                waveToBeDispatched->wfSlotId).second = BLOCKED;
+
+        // the scheduler must have removed exactly the chosen wave
+        assert(computeUnit->readyList[j].size() == readyListSize - 1);
+    }
+    // arbitrate over all shared resources among instructions being issued
+    // simultaneously
+    arbitrate();
+}
+
+// No statistics are registered by this stage.
+void
+ScheduleStage::regStats()
+{
+}
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh
new file mode 100644
index 000000000..26eb9a25b
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.hh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULE_STAGE_HH__ +#define __SCHEDULE_STAGE_HH__ + +#include +#include + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/scheduler.hh" +#include "gpu-compute/scoreboard_check_stage.hh" + +// Schedule or execution arbitration stage. +// From the pool of ready waves in the ready list, +// one wave is selected for each execution resource. 
+// The selection is made based on a scheduling policy + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +class ScheduleStage +{ + public: + ScheduleStage(const ComputeUnitParams *params); + ~ScheduleStage(); + void init(ComputeUnit *cu); + void exec(); + void arbitrate(); + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + + // Each execution resource will have its own + // scheduler and a dispatch list + std::vector scheduler; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector>*> + waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + std::vector> *dispatchList; + + std::string _name; +}; + +#endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc new file mode 100644 index 000000000..1cd0bfe55 --- /dev/null +++ b/src/gpu-compute/scheduler.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scheduler.hh" + +Scheduler::Scheduler(const ComputeUnitParams *p) +{ + if (p->execPolicy == "OLDEST-FIRST") { + schedPolicy = SCHED_POLICY::OF_POLICY; + } else if (p->execPolicy == "ROUND-ROBIN") { + schedPolicy = SCHED_POLICY::RR_POLICY; + } else { + fatal("Unimplemented scheduling policy"); + } +} + +Wavefront* +Scheduler::chooseWave() +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + return OFSchedPolicy.chooseWave(); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + return RRSchedPolicy.chooseWave(); + } else { + fatal("Unimplemented scheduling policy"); + } +} + +void +Scheduler::bindList(std::vector *list) +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + OFSchedPolicy.bindList(list); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + RRSchedPolicy.bindList(list); + } else { + fatal("Unimplemented scheduling policy"); + } +} diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh new file mode 100644 index 000000000..148ec9425 --- /dev/null +++ b/src/gpu-compute/scheduler.hh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULER_HH__ +#define __SCHEDULER_HH__ + +#include "gpu-compute/of_scheduling_policy.hh" +#include "gpu-compute/rr_scheduling_policy.hh" +#include "gpu-compute/scheduling_policy.hh" +#include "params/ComputeUnit.hh" + +enum SCHED_POLICY +{ + OF_POLICY = 0, + RR_POLICY +}; + +class Scheduler +{ + public: + Scheduler(const ComputeUnitParams *params); + Wavefront *chooseWave(); + void bindList(std::vector *list); + + private: + SCHED_POLICY schedPolicy; + SchedulingPolicy RRSchedPolicy; + SchedulingPolicy OFSchedPolicy; +}; + +#endif // __SCHEDULER_HH__ diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh new file mode 100644 index 000000000..b5e923c62 --- /dev/null +++ b/src/gpu-compute/scheduling_policy.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULING_POLICY_HH__ +#define __SCHEDULING_POLICY_HH__ + +#include + +template +class SchedulingPolicy +{ + public: + Wavefront* chooseWave() { return policyImpl.chooseWave(); } + + void + bindList(std::vector *list) + { + return policyImpl.bindList(list); + } + + private: + Impl policyImpl; +}; + +#endif // __SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc new file mode 100644 index 000000000..0d856a9b0 --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+// Capture the unit counts from the compute unit parameters. The pointers
+// into the compute unit are left null until init() is called.
+ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
+    : numSIMDs(p->num_SIMDs),
+      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+      numGlbMemPipes(p->num_global_mem_pipes),
+      numShrMemPipes(p->num_shared_mem_pipes),
+      vectorAluInstAvail(nullptr),
+      lastGlbMemSimd(-1),
+      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
+      shrMemInstAvail(nullptr)
+{
+}
+
+// Drop the cached list pointers and reset the counter pointers.
+ScoreboardCheckStage::~ScoreboardCheckStage()
+{
+    readyList.clear();
+    waveStatusList.clear();
+    shrMemInstAvail = nullptr;
+    glbMemInstAvail = nullptr;
+}
+
+// Attach this stage to its compute unit: cache pointers to the ready
+// lists (one per execution resource), the wave-status lists (one per
+// SIMD) and the instruction-availability trackers.
+void
+ScoreboardCheckStage::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScoreboardCheckStage";
+
+    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+        readyList.push_back(&computeUnit->readyList[unitId]);
+    }
+
+    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
+    }
+
+    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
+    glbMemInstAvail= &computeUnit->glbMemInstAvail;
+    shrMemInstAvail= &computeUnit->shrMemInstAvail;
+}
+
+// Reset the per-cycle availability trackers before the wavefronts are
+// scanned in exec().
+void
+ScoreboardCheckStage::initStatistics()
+{
+    lastGlbMemSimd = -1;
+    lastShrMemSimd = -1;
+    *glbMemInstAvail = 0;
+    *shrMemInstAvail = 0;
+
+    for (int unitId = 0; unitId < numSIMDs; ++unitId)
+        vectorAluInstAvail->at(unitId) = false;
+}
+
+// Update the availability trackers for one wavefront of SIMD unit
+// 'unitId'. Wavefronts with an empty instruction buffer are ignored.
+void
+ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+{
+    if (curWave->instructionBuffer.empty())
+        return;
+
+    // track which vector SIMD unit has at least one WV with a vector
+    // ALU as the oldest instruction in its Instruction buffer
+    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
+                                     curWave->isOldestInstALU();
+
+    // track how many vector SIMD units have at least one WV with a
+    // vector Global memory instruction as the oldest instruction
+    // in its Instruction buffer
+    // (counted at most once per SIMD unit; the '<= 1' guard stops the
+    // counter from growing past 2)
+    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
+        curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
+        *glbMemInstAvail <= 1) {
+        (*glbMemInstAvail)++;
+        lastGlbMemSimd = unitId;
+    }
+
+    // track how many vector SIMD units have at least one WV with a
+    // vector shared memory (LDS) instruction as the oldest instruction
+    // in its Instruction buffer
+    // TODO: parametrize the limit of the LDS units
+    if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
+        lastShrMemSimd != unitId) {
+        (*shrMemInstAvail)++;
+        lastShrMemSimd = unitId;
+    }
+}
+
+// Rebuild the ready lists for this cycle: every wavefront is first marked
+// BLOCKED, then appended to the ready list of the execution resource its
+// ready instruction type maps to and marked READY.
+void
+ScoreboardCheckStage::exec()
+{
+    initStatistics();
+
+    // reset the ready list for all execution units; it will be
+    // constructed every cycle since resource availability may change
+    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+        readyList[unitId]->clear();
+    }
+
+    // iterate over the Wavefronts of all SIMD units
+    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+        for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
+            // reset the ready status of each wavefront
+            waveStatusList[unitId]->at(wvId).second = BLOCKED;
+            Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
+            collectStatistics(curWave, unitId);
+
+            if (curWave->ready(Wavefront::I_ALU)) {
+                readyList[unitId]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_GLOBAL)) {
+                // when cedeSIMD() returns true the wave is skipped this
+                // cycle and stays BLOCKED
+                if (computeUnit->cedeSIMD(unitId, wvId)) {
+                    continue;
+                }
+
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_SHARED)) {
+                readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_FLAT)) {
+                // flat accesses are queued on the global memory unit
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_PRIVATE)) {
+                // private accesses are queued on the global memory unit
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            }
+        }
+    }
+}
+
+// No statistics are registered by this stage.
+void
+ScoreboardCheckStage::regStats()
+{
+}
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh
new file mode 100644
index 000000000..099597afb
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3.
#ifndef __SCOREBOARD_CHECK_STAGE_HH__
#define __SCOREBOARD_CHECK_STAGE_HH__

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

class ComputeUnit;
class Wavefront;

struct ComputeUnitParams;

// Per-cycle scheduling status of a wavefront slot.
enum WAVE_STATUS
{
    BLOCKED = 0,
    READY
};

/*
 * Scoreboard check stage.
 * All wavefronts are analyzed to see if they are ready
 * to be executed this cycle. Both structural and data
 * hazards are considered while marking a wave "ready"
 * for execution. After analysis, the ready waves are
 * added to readyList.
 */
class ScoreboardCheckStage
{
  public:
    ScoreboardCheckStage(const ComputeUnitParams* params);
    ~ScoreboardCheckStage();
    void init(ComputeUnit *cu);
    void exec();

    // Stats related variables and methods
    const std::string& name() const { return _name; }
    void regStats();

  private:
    void collectStatistics(Wavefront *curWave, int unitId);
    void initStatistics();
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;
    uint32_t numMemUnits;
    uint32_t numGlbMemPipes;
    uint32_t numShrMemPipes;

    // flag per vector SIMD unit that is set when there is at least one
    // WF that has a vector ALU instruction as the oldest in its
    // Instruction Buffer
    std::vector<bool> *vectorAluInstAvail;
    int lastGlbMemSimd;
    int lastShrMemSimd;

    int *glbMemInstAvail;
    int *shrMemInstAvail;
    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list
    std::vector<std::vector<Wavefront*>*> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
        waveStatusList;

    std::string _name;
};

#endif // __SCOREBOARD_CHECK_STAGE_HH__
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/shader.hh" + +#include + +#include "arch/x86/linux/linux.hh" +#include "base/chunk_generator.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUMem.hh" +#include "debug/HSAIL.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/sim_exit.hh" + +Shader::Shader(const Params *p) : SimObject(p), + clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), + cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), + hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), + separate_acquire_release(p->separate_acquire_release), coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), + box_tick_cnt(0), start_tick_cnt(0) +{ + + cuList.resize(n_cu); + + for (int i = 0; i < n_cu; ++i) { + cuList[i] = p->CUs[i]; + assert(i == cuList[i]->cu_id); + cuList[i]->shader = this; + } +} + +Addr +Shader::mmap(int length) +{ + + Addr start; + + // round up length to the next page + length = roundUp(length, TheISA::PageBytes); + + if (X86Linux64::mmapGrowsDown()) { + DPRINTF(HSAIL, "GROWS DOWN"); + start = gpuTc->getProcessPtr()->mmap_end -length; + gpuTc->getProcessPtr()->mmap_end = start; + } else { + DPRINTF(HSAIL, "GROWS UP"); + start = gpuTc->getProcessPtr()->mmap_end; + gpuTc->getProcessPtr()->mmap_end += length; + + // assertion to make sure we don't overwrite the stack (it grows down) + assert(gpuTc->getProcessPtr()->mmap_end < + gpuTc->getProcessPtr()->stack_base - + gpuTc->getProcessPtr()->max_stack_size); + + } + + DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + + gpuTc->getProcessPtr()->allocateMem(start,length); + + return start; +} + +void +Shader::init() +{ + // grab the 
threadContext of the thread running on the CPU + assert(cpuPointer); + gpuTc = cpuPointer->getContext(0); + assert(gpuTc); +} + +Shader::~Shader() +{ + for (int j = 0; j < n_cu; ++j) + delete cuList[j]; +} + +void +Shader::updateThreadContext(int tid) { + // thread context of the thread which dispatched work + assert(cpuPointer); + gpuTc = cpuPointer->getContext(tid); + assert(gpuTc); +} + +void +Shader::hostWakeUp(BaseCPU *cpu) { + if (cpuPointer == cpu) { + if (gpuTc->status() == ThreadContext::Suspended) + cpu->activateContext(gpuTc->threadId()); + } else { + //Make sure both dispatcher and shader are trying to + //wakeup same host. Hack here to enable kernel launch + //from multiple CPUs + panic("Dispatcher wants to wakeup a different host"); + } +} + +Shader* +ShaderParams::create() +{ + return new Shader(this); +} + +void +Shader::exec() +{ + tick_cnt = curTick(); + box_tick_cnt = curTick() - start_tick_cnt; + + // apply any scheduled adds + for (int i = 0; i < sa_n; ++i) { + if (sa_when[i] <= tick_cnt) { + *sa_val[i] += sa_x[i]; + sa_val.erase(sa_val.begin() + i); + sa_x.erase(sa_x.begin() + i); + sa_when.erase(sa_when.begin() + i); + --sa_n; + --i; + } + } + + // clock all of the cu's + for (int i = 0; i < n_cu; ++i) + cuList[i]->exec(); +} + +bool +Shader::dispatch_workgroups(NDRange *ndr) +{ + bool scheduledSomething = false; + int cuCount = 0; + int curCu = nextSchedCu; + + while (cuCount < n_cu) { + //Every time we try a CU, update nextSchedCu + nextSchedCu = (nextSchedCu + 1) % n_cu; + + // dispatch workgroup iff the following two conditions are met: + // (a) wg_rem is true - there are unassigned workgroups in the grid + // (b) there are enough free slots in cu cuList[i] for this wg + if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + scheduledSomething = true; + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); + + // ticks() member function translates cycles to simulation ticks. 
+ if (!tickEvent.scheduled()) { + schedule(tickEvent, curTick() + this->ticks(1)); + } + + cuList[curCu]->StartWorkgroup(ndr); + ndr->wgId[0]++; + ndr->globalWgId++; + if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { + ndr->wgId[0] = 0; + ndr->wgId[1]++; + + if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { + ndr->wgId[1] = 0; + ndr->wgId[2]++; + + if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { + ndr->wg_disp_rem = false; + break; + } + } + } + } + + ++cuCount; + curCu = nextSchedCu; + } + + return scheduledSomething; +} + +void +Shader::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; +} + +void +Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id) +{ + unsigned block_size = RubySystem::getBlockSizeBytes(); + unsigned size = req->getSize(); + + Addr tmp_addr; + BaseTLB::Mode trans_mode; + + if (cmd == MemCmd::ReadReq) { + trans_mode = BaseTLB::Read; + } else if (cmd == MemCmd::WriteReq) { + trans_mode = BaseTLB::Write; + } else { + fatal("unexcepted MemCmd\n"); + } + + tmp_addr = req->getVaddr(); + Addr split_addr = roundDown(tmp_addr + size - 1, block_size); + + assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); + + // Misaligned access + if (split_addr > tmp_addr) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + + + PacketPtr pkt1 = new Packet(req2, cmd); + PacketPtr pkt2 = new Packet(req1, cmd); + + functionalTLBAccess(pkt1, cu_id, trans_mode); + functionalTLBAccess(pkt2, cu_id, trans_mode); + + PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); + PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); + + new_pkt1->dataStatic(data); + new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); + + if (suppress_func_errors) { + new_pkt1->setSuppressFuncError(); + new_pkt2->setSuppressFuncError(); + } + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + 
cuList[0]->memPort[0]->sendFunctional(new_pkt1); + cuList[0]->memPort[0]->sendFunctional(new_pkt2); + + delete new_pkt1; + delete new_pkt2; + delete pkt1; + delete pkt2; + } else { + PacketPtr pkt = new Packet(req, cmd); + functionalTLBAccess(pkt, cu_id, trans_mode); + PacketPtr new_pkt = new Packet(pkt->req, cmd); + new_pkt->dataStatic(data); + + if (suppress_func_errors) { + new_pkt->setSuppressFuncError(); + }; + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt); + + delete new_pkt; + delete pkt; + } +} + +bool +Shader::busy() +{ + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + if (!cuList[i_cu]->isDone()) { + return true; + } + } + + return false; +} + +void +Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +{ + sa_val.push_back(val); + sa_when.push_back(tick_cnt + when); + sa_x.push_back(x); + ++sa_n; +} + +Shader::TickEvent::TickEvent(Shader *_shader) + : Event(CPU_Tick_Pri), shader(_shader) +{ +} + + +void +Shader::TickEvent::process() +{ + if (shader->busy()) { + shader->exec(); + shader->schedule(this, curTick() + shader->ticks(1)); + } +} + +const char* +Shader::TickEvent::description() const +{ + return "Shader tick"; +} + +void +Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors) +{ + uint8_t *data_buf = (uint8_t*)ptr; + + for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); + !gen.done(); gen.next()) { + Request *req = new Request(0, gen.addr(), gen.size(), 0, + cuList[0]->masterId(), 0, 0, 0); + + doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); + data_buf += gen.size(); + delete req; + } +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool 
suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); +} + +void +Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); +} + +void +Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, + suppress_func_errors); +} + +/* + * Send a packet through the appropriate TLB functional port. + * If cu_id=n_cu, then this is the dispatcher's TLB. + * Otherwise it's the TLB of the cu_id compute unit. + */ +void +Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) +{ + // update senderState. Need to know the gpuTc and the TLB mode + pkt->senderState = + new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + + if (cu_id == n_cu) { + dispatcher->tlbPort->sendFunctional(pkt); + } else { + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + } + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete pkt->senderState; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh new file mode 100644 index 000000000..91ea8aae0 --- /dev/null +++ b/src/gpu-compute/shader.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __SHADER_HH__ +#define __SHADER_HH__ + +#include +#include + +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "base/types.hh" +#include "cpu/simple/atomic.hh" +#include "cpu/simple/timing.hh" +#include "cpu/simple_thread.hh" +#include "cpu/thread_context.hh" +#include "cpu/thread_state.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/page_table.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/Shader.hh" +#include "sim/faults.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class GpuDispatcher; + +namespace TheISA +{ + class GpuTLB; +} + +static const int LDS_SIZE = 65536; + +// Class Shader: This describes a single shader instance. Most +// configurations will only have a single shader. + +class Shader : public SimObject +{ + protected: + // Shader's clock period in terms of number of ticks of curTime, + // aka global simulation clock + Tick clock; + + public: + typedef ShaderParams Params; + enum hsail_mode_e {SIMT,VECTOR_SCALAR}; + + // clock related functions ; maps to-and-from + // Simulation ticks and shader clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + + Tick getClock() const { return clock; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + + SimpleThread *cpuThread; + ThreadContext *gpuTc; + BaseCPU *cpuPointer; + + class TickEvent : public Event + { + private: + Shader *shader; + + public: + TickEvent(Shader*); + void process(); + const char* description() const; + }; + + TickEvent tickEvent; + + // is this simulation going to be timing mode in the memory? 
+ bool timingSim; + hsail_mode_e hsail_mode; + + // If set, issue acq packet @ kernel launch + int impl_kern_boundary_sync; + // If set, generate a separate packet for acquire/release on + // ld_acquire/st_release/atomic operations + int separate_acquire_release; + // If set, fetch returns may be coissued with instructions + int coissue_return; + // If set, always dump all 64 gprs to trace + int trace_vgpr_all; + // Number of cu units in the shader + int n_cu; + // Number of wavefront slots per cu + int n_wf; + // The size of global memory + int globalMemSize; + + /* + * Bytes/work-item for call instruction + * The number of arguments for an hsail function will + * vary. We simply determine the maximum # of arguments + * required by any hsail function up front before the + * simulation (during parsing of the Brig) and record + * that number here. + */ + int funcargs_size; + + // Tracks CU that rr dispatcher should attempt scheduling + int nextSchedCu; + + // Size of scheduled add queue + uint32_t sa_n; + + // Pointer to value to be increments + std::vector sa_val; + // When to do the increment + std::vector sa_when; + // Amount to increment by + std::vector sa_x; + + // List of Compute Units (CU's) + std::vector cuList; + + uint64_t tick_cnt; + uint64_t box_tick_cnt; + uint64_t start_tick_cnt; + + GpuDispatcher *dispatcher; + + Shader(const Params *p); + ~Shader(); + virtual void init(); + + // Run shader + void exec(); + + // Check to see if shader is busy + bool busy(); + + // Schedule a 32-bit value to be incremented some time in the future + void ScheduleAdd(uint32_t *val, Tick when, int x); + bool processTimingPacket(PacketPtr pkt); + + void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void WriteMem(uint64_t address, void *ptr, 
uint32_t sz, int cu_id); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id); + + void + registerCU(int cu_id, ComputeUnit *compute_unit) + { + cuList[cu_id] = compute_unit; + } + + void handshake(GpuDispatcher *dispatcher); + bool dispatch_workgroups(NDRange *ndr); + Addr mmap(int length); + void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); + void updateThreadContext(int tid); + void hostWakeUp(BaseCPU *cpu); +}; + +#endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc new file mode 100644 index 000000000..0e35ab9cc --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/simple_pool_manager.hh" + +#include "base/misc.hh" + +// return the min number of elements that the manager can reserve given +// a request for "size" elements +uint32_t +SimplePoolManager::minAllocatedElements(uint32_t size) +{ + fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n", + size); + + return size % minAllocation() > 0 ? + (minAllocation() - (size % minAllocation())) + size : size; +} + +std::string +SimplePoolManager::printRegion() +{ + std::string _cout; + if (_reservedGroups == 0) + _cout = "VRF is empty\n"; + else if (_reservedGroups > 0) { + uint32_t reservedEntries = _reservedGroups * _regionSize; + _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n"; + } + + return _cout; +} + +bool +SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size) +{ + assert(numRegions * minAllocatedElements(size) <= poolSize()); + + return _reservedGroups == 0; +} + +void +SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx) +{ + assert(_reservedGroups > 0); + --_reservedGroups; + + if (!_reservedGroups) + _nxtFreeIdx = 0; +} + +uint32_t +SimplePoolManager::allocateRegion(const uint32_t size, + uint32_t *reservedPoolSize) +{ + uint32_t actualSize = minAllocatedElements(size); + uint32_t startIdx = _nxtFreeIdx; + _nxtFreeIdx += actualSize; + _regionSize = actualSize; + assert(_nxtFreeIdx < poolSize()); + 
*reservedPoolSize = actualSize; + ++_reservedGroups; + + return startIdx; +} + +uint32_t +SimplePoolManager::regionSize(std::pair ®ion) +{ + bool wrapAround = (region.first > region.second); + if (!wrapAround) { + return region.second - region.first + 1; + } else { + return region.second + poolSize() - region.first + 1; + } +} diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh new file mode 100644 index 000000000..1d4174da8 --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.hh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __SIMPLE_POOL_MANAGER_HH__ +#define __SIMPLE_POOL_MANAGER_HH__ + +#include +#include + +#include "gpu-compute/pool_manager.hh" + +// Simple Pool Manager: allows one region per pool. No region merging is +// supported. +class SimplePoolManager : public PoolManager +{ + public: + SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) + : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + _reservedGroups(0) + { + } + + uint32_t minAllocatedElements(uint32_t size); + std::string printRegion(); + bool canAllocate(uint32_t numRegions, uint32_t size); + uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize); + void freeRegion(uint32_t firstIdx, uint32_t lastIdx); + uint32_t regionSize(std::pair ®ion); + + private: + // actual size of a region (normalized to the minimum size that can + // be reserved) + uint32_t _regionSize; + // next index to allocate a region + uint8_t _nxtFreeIdx; + // number of groups that reserve a region + uint32_t _reservedGroups; +}; + +#endif // __SIMPLE_POOL_MANAGER_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the + * can be coalesced with and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. 
+ */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. 
+ sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. 
+ // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. 
+ if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. 
Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? 
+ int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. + rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + 
coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh new file mode 100644 index 000000000..09210148b --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.hh @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#ifndef __TLB_COALESCER_HH__ +#define __TLB_COALESCER_HH__ + +#include +#include +#include +#include + +#include "arch/generic/tlb.hh" +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/regs/segment.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/TLBCoalescer.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +/** + * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of + * each TLB. It receives packets and issues coalesced requests to the + * TLB below it. It controls how requests are coalesced (the rules) + * and the permitted number of TLB probes per cycle (i.e., how many + * coalesced requests it feeds the TLB per cycle). + */ +class TLBCoalescer : public MemObject +{ + protected: + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the + // python config files. + int clock; + + public: + typedef TLBCoalescerParams Params; + TLBCoalescer(const Params *p); + ~TLBCoalescer() { } + + // Number of TLB probes per cycle. Parameterizable - default 2. + int TLBProbesPerCycle; + + // Consider coalescing across that many ticks. + // Paraemterizable - default 1. + int coalescingWindow; + + // Each coalesced request consists of multiple packets + // that all fall within the same virtual page + typedef std::vector coalescedReq; + + // disables coalescing when true + bool disableCoalescing; + + /* + * This is a hash map with as a key. + * It contains a vector of coalescedReqs per . + * Requests are buffered here until they can be issued to + * the TLB, at which point they are copied to the + * issuedTranslationsTable hash map. 
+ * + * In terms of coalescing, we coalesce requests in a given + * window of x cycles by using tick_index = issueTime/x as a + * key, where x = coalescingWindow. issueTime is the issueTime + * of the pkt from the ComputeUnit's perspective, but another + * option is to change it to curTick(), so we coalesce based + * on the receive time. + */ + typedef std::unordered_map> CoalescingFIFO; + + CoalescingFIFO coalescerFIFO; + + /* + * issuedTranslationsTabler: a hash_map indexed by virtual page + * address. Each hash_map entry has a vector of PacketPtr associated + * with it denoting the different packets that share an outstanding + * coalesced translation request for the same virtual page. + * + * The rules that determine which requests we can coalesce are + * specified in the canCoalesce() method. + */ + typedef std::unordered_map CoalescingTable; + + CoalescingTable issuedTranslationsTable; + + // number of packets the coalescer receives + Stats::Scalar uncoalescedAccesses; + // number packets the coalescer send to the TLB + Stats::Scalar coalescedAccesses; + + // Number of cycles the coalesced requests spend waiting in + // coalescerFIFO. For each packet the coalescer receives we take into + // account the number of all uncoalesced requests this pkt "represents" + Stats::Scalar queuingCycles; + + // On average how much time a request from the + // uncoalescedAccesses that reaches the TLB + // spends waiting? + Stats::Scalar localqueuingCycles; + // localqueuingCycles/uncoalescedAccesses + Stats::Formula localLatency; + + bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2); + void updatePhysAddresses(PacketPtr pkt); + void regStats(); + + // Clock related functions. Maps to-and-from + // Simulation ticks and object clocks. 
+ Tick frequency() const { return SimClock::Frequency / clock; } + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + } + + virtual AddrRangeList getAddrRanges() const; + }; + + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + std::deque retries; + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() not implemented in TLB coalescer"); + } + }; + + // Coalescer slave ports on the cpu Side + std::vector cpuSidePort; + // Coalescer master ports on the memory side + std::vector memSidePort; + + BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx); + BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx); + + class IssueProbeEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + IssueProbeEvent(TLBCoalescer *_coalescer); + void process(); + const 
char *description() const; + }; + + // this event issues the TLB probes + IssueProbeEvent probeTLBEvent; + + // the cleanupEvent is scheduled after a TLBEvent triggers + // in order to free memory and do the required clean-up + class CleanupEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + CleanupEvent(TLBCoalescer *_coalescer); + void process(); + const char* description() const; + }; + + // schedule cleanup + CleanupEvent cleanupEvent; + + // this FIFO queue keeps track of the virt. page + // addresses that are pending cleanup + std::queue cleanupQueue; +}; + +#endif // __TLB_COALESCER_HH__ diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc new file mode 100644 index 000000000..8b7dc0691 --- /dev/null +++ b/src/gpu-compute/vector_register_file.cc @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_file.hh" + +#include + +#include "base/misc.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/wavefront.hh" +#include "params/VectorRegisterFile.hh" + +VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) + : SimObject(p), + manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), + simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), + vgprState(new VecRegisterState()) +{ + fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " + "multiple of VRF size\n"); + + busy.clear(); + busy.resize(numRegsPerSimd, 0); + nxtBusy.clear(); + nxtBusy.resize(numRegsPerSimd, 0); + + vgprState->init(numRegsPerSimd); +} + +void +VectorRegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + vgprState->setParent(computeUnit); +} + +uint8_t +VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = nxtBusy.at(idx); + + if (operandSize > 4) { + status = status | (nxtBusy.at((idx + 1) % numRegs())); + } + + return status; +} + +uint8_t +VectorRegisterFile::regBusy(int idx, 
uint32_t operandSize) const +{ + uint8_t status = busy.at(idx); + + if (operandSize > 4) { + status = status | (busy.at((idx + 1) % numRegs())); + } + + return status; +} + +void +VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + nxtBusy.at(regIdx) = value; + + if (operandSize > 4) { + nxtBusy.at((regIdx + 1) % numRegs()) = value; + } +} + +void +VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + busy.at(regIdx) = value; + + if (operandSize > 4) { + busy.at((regIdx + 1) % numRegs()) = value; + } +} + +bool +VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i)) { + uint32_t vgprIdx = ii->getRegisterIndex(i); + uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); + + if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + + if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + } + } + + return true; +} + +void +VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +{ + bool loadInstr = IS_OT_READ(ii->opType()); + bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + + bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); + + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + + // mark the destination vector register as busy + markReg(physReg, ii->getOperandSize(i), 1); + // clear the in-flight status of the destination vector register + preMarkReg(physReg, 
ii->getOperandSize(i), 0); + + // FIXME: if we ever model correct timing behavior + // for load argument instructions then we should not + // set the destination register as busy now but when + // the data returns. Loads and Atomics should free + // their destination registers when the data returns, + // not now + if (!atomicInstr && !loadNoArgInstr) { + uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? + computeUnit->spBypassLength() : + computeUnit->dpBypassLength(); + + // schedule an event for marking the register as ready + computeUnit->registerEvent(w->simdId, physReg, + ii->getOperandSize(i), + computeUnit->shader->tick_cnt + + computeUnit->shader->ticks(pipeLen), + 0); + } + } + } +} + +int +VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, + std::vector ®Vec, uint32_t operandSize, + uint64_t timestamp) +{ + int delay = 0; + + panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", + regVec.size()); + + for (int i = 0; i < regVec.size(); ++i) { + // mark the destination VGPR as free when the timestamp expires + computeUnit->registerEvent(w->simdId, regVec[i], operandSize, + computeUnit->shader->tick_cnt + timestamp + + computeUnit->shader->ticks(delay), 0); + } + + return delay; +} + +void +VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + // set the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 1); + } + } +} + +bool +VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +bool +VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return 
ready; +} + +VectorRegisterFile* +VectorRegisterFileParams::create() +{ + return new VectorRegisterFile(this); +} diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh new file mode 100644 index 000000000..1cb011a1e --- /dev/null +++ b/src/gpu-compute/vector_register_file.hh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_FILE_HH__ +#define __VECTOR_REGISTER_FILE_HH__ + +#include + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/vector_register_state.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class SimplePoolManager; +class Wavefront; + +struct VectorRegisterFileParams; + +enum class VrfAccessType : uint8_t +{ + READ = 0x01, + WRITE = 0x02, + RD_WR = READ | WRITE +}; + +// Vector Register File: values live in vgprState, busy/nxtBusy track use +class VectorRegisterFile : public SimObject +{ + public: + VectorRegisterFile(const VectorRegisterFileParams *p); + + void setParent(ComputeUnit *_computeUnit); + + // Read register regIdx for thread threadId from vgprState + template + T + read(int regIdx, int threadId=0) + { + T p0 = vgprState->read(regIdx, threadId); + + return p0; + } + + // Write value to register regIdx for thread threadId in vgprState + template + void + write(int regIdx, T value, int threadId=0) + { + vgprState->write(regIdx, value, threadId); + } + + uint8_t regBusy(int idx, uint32_t operandSize) const; + uint8_t regNxtBusy(int idx, uint32_t operandSize) const; + + int numRegs() const { return numRegsPerSimd; } + + void markReg(int regIdx, uint32_t operandSize, uint8_t value); + void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); + + virtual void exec(GPUDynInstPtr ii, Wavefront *w); + + virtual int exec(uint64_t dynamic_id, Wavefront *w, + std::vector ®Vec, uint32_t operandSize, + uint64_t timestamp); + + bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual void updateEvents() { } + virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); + + virtual bool + isReadConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool + isWriteConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType); + + virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType); + +
SimplePoolManager *manager; + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector busy; + // flag indicating if a register will be busy (by instructions + // in the SIMD pipeline) + std::vector nxtBusy; + + // number of registers (bank size) per simd unit (bank) + int numRegsPerSimd; + + // vector register state (backing storage used by read()/write()) + VecRegisterState *vgprState; +}; + +#endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc new file mode 100644 index 000000000..f231b0579 --- /dev/null +++ b/src/gpu-compute/vector_register_state.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_state.hh" + +#include "gpu-compute/compute_unit.hh" + +VecRegisterState::VecRegisterState() : computeUnit(nullptr) +{ + s_reg.clear(); + d_reg.clear(); +} + +void +VecRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".VecRegState"; +} + +void +VecRegisterState::init(uint32_t _size) +{ + s_reg.resize(_size); + d_reg.resize(_size); +} diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh new file mode 100644 index 000000000..a233b9acc --- /dev/null +++ b/src/gpu-compute/vector_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_STATE_HH__ +#define __VECTOR_REGISTER_STATE_HH__ + +#include +#include +#include +#include + +#include "gpu-compute/misc.hh" + +class ComputeUnit; + +// Vector Register State per SIMD unit (contents of the vector +// registers in the VRF of the SIMD) +class VecRegisterState +{ + public: + VecRegisterState(); + void init(uint32_t _size); + + const std::string& name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + // Access methods: 4-byte T uses s_reg, 8-byte T uses d_reg (asserted) + template + T + read(int regIdx, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + return *p0; + } + + template + void + write(unsigned int regIdx, T value, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0
= (T*)(&d_reg[regIdx][threadId]); + } + + *p0 = value; + } + + // Number of entries in s_reg (the single-precision register array). + int regSize() { return s_reg.size(); } + + private: + ComputeUnit *computeUnit; + std::string _name; + // 32-bit Single Precision Vector Register State, [regIdx][threadId] + std::vector> s_reg; + // 64-bit Double Precision Vector Register State, [regIdx][threadId] + std::vector> d_reg; +}; + +#endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc new file mode 100644 index 000000000..0aa033db1 --- /dev/null +++ b/src/gpu-compute/wavefront.cc @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/wavefront.hh" + +#include "debug/GPUExec.hh" +#include "debug/WavefrontStack.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" + +Wavefront* +WavefrontParams::create() +{ + return new Wavefront(this); +} + +Wavefront::Wavefront(const Params *p) + : SimObject(p), callArgMem(nullptr) +{ + last_trace = 0; + simdId = p->simdId; + wfSlotId = p->wf_slot_id; + + status = S_STOPPED; + reservedVectorRegs = 0; + startVgprIndex = 0; + outstanding_reqs = 0; + mem_reqs_in_pipe = 0; + outstanding_reqs_wr_gm = 0; + outstanding_reqs_wr_lm = 0; + outstanding_reqs_rd_gm = 0; + outstanding_reqs_rd_lm = 0; + rd_lm_reqs_in_pipe = 0; + rd_gm_reqs_in_pipe = 0; + wr_lm_reqs_in_pipe = 0; + wr_gm_reqs_in_pipe = 0; + + barrier_cnt = 0; + old_barrier_cnt = 0; + stalledAtBarrier = false; + + mem_trace_busy = 0; + old_vgpr_tcnt = 0xffffffffffffffffll; + old_dgpr_tcnt = 0xffffffffffffffffll; + + pendingFetch = false; + dropFetch = false; + condRegState = new ConditionRegisterState(); + maxSpVgprs = 0; + maxDpVgprs = 0; +} + +void +Wavefront::regStats() +{ + srcRegOpDist + .init(0, 4, 2) + .name(name() + ".src_reg_operand_dist") + .desc("number of executed instructions with N source register operands") + ; + + dstRegOpDist + .init(0, 3, 2) + .name(name() +
".dst_reg_operand_dist") + .desc("number of executed instructions with N destination register " + "operands") + ; + + // FIXME: the name of the WF needs to be unique (name() prefixes this stat) + numTimesBlockedDueWAXDependencies + .name(name() + ".timesBlockedDueWAXDependencies") + .desc("number of times the wf's instructions are blocked due to WAW " + "or WAR dependencies") + ; + + // FIXME: the name of the WF needs to be unique (name() prefixes this stat) + numTimesBlockedDueRAWDependencies + .name(name() + ".timesBlockedDueRAWDependencies") + .desc("number of times the wf's instructions are blocked due to RAW " + "dependencies") + ; + + // FIXME: the name of the WF needs to be unique (name() prefixes this stat) + numTimesBlockedDueVrfPortAvail + .name(name() + ".timesBlockedDueVrfPortAvail") + .desc("number of times instructions are blocked due to VRF port " + "availability") + ; +} + +void +Wavefront::init() +{ + reservedVectorRegs = 0; + startVgprIndex = 0; +} + +void +Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +{ + condRegState->init(num_cregs); + maxSpVgprs = num_sregs; + maxDpVgprs = num_dregs; +} + +Wavefront::~Wavefront() +{ + if (callArgMem) + delete callArgMem; +} + +void +Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) +{ + wfDynId = _wfDynId; + base_ptr = _base_ptr; + status = S_RUNNING; +} + +bool +Wavefront::isGmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType())) { + return true; + } + + if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType())) { + + return true; + } + + if (IS_OT_FLAT(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isLmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || + IS_OT_ATOMIC_LM(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if
(status != S_STOPPED && (ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstBarrier() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstGMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstLMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstPrivMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstFlatMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { + + return true; + } + + return false; +} + +// Return true if the Wavefront's instruction +// buffer holds a branch or return instruction.
+bool +Wavefront::instructionBufferHasBranch() +{ + for (auto it : instructionBuffer) { + GPUDynInstPtr ii = it; + + if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + return true; + } + } + + return false; +} + +// Remap HSAIL register to physical VGPR. +// HSAIL register = virtual register assigned to an operand by HLC compiler +uint32_t +Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +{ + assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); + // add the offset from where the VGPRs of the wavefront have been assigned + uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; + // HSAIL double precision (DP) register: calculate the physical VGPR index + // assuming that DP registers are placed after SP ones in the VRF. The DP + // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust + // the DP VGPR index before mapping it to the physical VRF address space + if (mode == 1 && size > 4) { + physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); + } + + assert((startVgprIndex <= physicalVgprIndex) && + (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); + + // calculate absolute physical VGPR index (modulo this SIMD's numRegs()) + return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); +} + +// Return 1 if this wavefront is ready to execute an instruction +// of the specified type, 0 otherwise (the return type is int). +int +Wavefront::ready(itype_e type) +{ + // Check to make sure wave is running + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return 0; + } + + // Is the wave waiting at a barrier + if (stalledAtBarrier) { + if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt, + computeUnit->getRefCounter(dispatchid, wg_id))) { + // Are all threads at barrier?
+ return 0; + } + old_barrier_cnt = barrier_cnt; + stalledAtBarrier = false; + } + + // Read the oldest instruction (front of the instruction buffer) + GPUDynInstPtr ii = instructionBuffer.front(); + + bool ready_inst M5_VAR_USED = false; + bool glbMemBusRdy = false; + bool glbMemIssueRdy = false; + if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { + for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { + if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) + glbMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + glbMemIssueRdy = true; + } + } + bool locMemBusRdy = false; + bool locMemIssueRdy = false; + if (type == I_SHARED) { + for (int j=0; j < computeUnit->numLocMemUnits; ++j) { + if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) + locMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + locMemIssueRdy = true; + } + } + + // The following code is very error prone and the entire process for + // checking readiness will be fixed eventually. In the meantime, let's + // make sure that we do not silently let an instruction type slip + // through this logic and always return not ready.
+ if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); + } + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", + computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); + + if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + // Here for ALU instruction (barrier) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + // Here for ALU instruction (nop) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + // Here for ALU instruction (return) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests?
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG)) { + // Here for ALU instruction (all others) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is alu slot free? + return 0; + } + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + // Here for Global memory instruction + if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + // Are there in pipe or outstanding global memory write requests? + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || + IS_OT_HIST_GM(ii->opType())) { + // Are there in pipe or outstanding global memory read requests? + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave issue slot free? + return 0; + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + // Here for Shared memory instruction + if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_HIST_LM(ii->opType())) { + if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + if (!locMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + // Here for Private memory instruction + if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || + IS_OT_HIST_PM(ii->opType())) { + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free?
+ return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? NOTE(review): locMemBusRdy is only set when type == I_SHARED, so this looks always-false for I_FLAT; confirm + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!locMemIssueRdy) { + return 0; + } + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + // are all the operands ready? (RAW, WAW and WAR dependencies met?)
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else { + return 0; + } + + assert(ready_inst); + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + simdId, wfSlotId, ii->disassemble()); + + return 1; +} + +void +Wavefront::updateResources() +{ + // Get current instruction (front of the instruction buffer) + GPUDynInstPtr ii = instructionBuffer.front(); + assert(ii); + computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. If we fix that then + // we should map them to one of the memory pipelines + ii->opType()==Enums::OT_KERN_READ || + ii->opType()==Enums::OT_ARG || + ii->opType()==Enums::OT_RET) { + computeUnit->aluPipe[simdId].preset(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + if ( Enums::SC_SHARED == ii->executedAs() ) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + mem_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_PM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +void +Wavefront::exec() +{ + // ---- Exit if wavefront is inactive ----------------------------- // + + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return; + } + + // Get current instruction + + GPUDynInstPtr ii = instructionBuffer.front(); + + const uint32_t old_pc = pc(); + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " + "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble(), old_pc); + ii->execute(); + // access the VRF + computeUnit->vrf[simdId]->exec(ii, this); + srcRegOpDist.sample(ii->numSrcRegOperands()); + dstRegOpDist.sample(ii->numDstRegOperands()); + computeUnit->numInstrExecuted++; + computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - + computeUnit->lastExecCycle[simdId]); + computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); + if (pc() == old_pc) { + uint32_t new_pc = old_pc + 1; + // PC not modified by instruction, proceed to next or pop frame + pc(new_pc); + if (new_pc == rpc()) { + popFromReconvergenceStack(); + discardFetch(); + } else { + instructionBuffer.pop_front(); + } + } + + if (computeUnit->shader->hsail_mode==Shader::SIMT) { + const int num_active_lanes = execMask().count(); + computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); + computeUnit->numVecOpsExecuted += num_active_lanes; + if (isGmInstruction(ii)) { + computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); + } else if (isLmInstruction(ii)) { + computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); + } + } + + // ---- Update Vector ALU pipeline and other resources ------------------ // + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as 
ALU operations + // since we don't send memory packets at execution. If we fix that then + // we should map them to one of the memory pipelines + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + ii->opType() == Enums::OT_RET) { + computeUnit->aluPipe[simdId].set(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 
+ set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +bool +Wavefront::waitingAtBarrier(int lane) +{ + return bar_cnt[lane] < max_bar_cnt; +} + +void +Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& mask) +{ + assert(mask.count()); + reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); +} + +void +Wavefront::popFromReconvergenceStack() +{ + assert(!reconvergenceStack.empty()); + + DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + execMask().to_string().c_str(), pc()); + + reconvergenceStack.pop(); + + DPRINTF(WavefrontStack, "%3i %s\n", pc(), + execMask().to_string().c_str()); + +} + +void +Wavefront::discardFetch() +{ + instructionBuffer.clear(); + dropFetch |=pendingFetch; +} + +uint32_t +Wavefront::pc() const +{ + return reconvergenceStack.top()->pc; +} + +uint32_t +Wavefront::rpc() const +{ + return reconvergenceStack.top()->rpc; +} + +VectorMask +Wavefront::execMask() const +{ + return reconvergenceStack.top()->execMask; +} + +bool +Wavefront::execMask(int lane) const +{ + return reconvergenceStack.top()->execMask[lane]; +} + + +void +Wavefront::pc(uint32_t new_pc) +{ + reconvergenceStack.top()->pc = new_pc; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh new file mode 100644 index 000000000..0abab8e83 --- /dev/null +++ b/src/gpu-compute/wavefront.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __WAVEFRONT_HH__ +#define __WAVEFRONT_HH__ + +#include +#include +#include +#include +#include + +#include "base/misc.hh" +#include "base/types.hh" +#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/misc.hh" +#include "params/Wavefront.hh" +#include "sim/sim_object.hh" + +static const int MAX_NUM_INSTS_PER_WF = 12; + +/* + * Arguments for the hsail opcode call, are user defined and variable length. + * The hardware/finalizer can support arguments in hardware or use memory to + * pass arguments. 
For now, let's assume that an unlimited number of arguments + * are supported in hardware (the compiler inlines functions whenever it can + * anyways, so unless someone is interested in the implications of linking/ + * library functions, I think this is a reasonable assumption given the typical + * size of an OpenCL kernel). + * + * Note that call args are different than kernel arguments: + * * All work-items in a kernel refer to the same set of kernel arguments + * * Each work-item has its own set of call args. So a call argument at + * address 0x4 is different for work-item 0 and work-item 1. + * + * Ok, the table below shows an example of how we organize the call arguments in + * the CallArgMem class. + * + * int foo(int arg1, double arg2) + * ___________________________________________________ + * | 0: return.0 | 4: return.1 | ... | 252: return.63 | + * |---------------------------------------------------| + * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | + * |---------------------------------------------------| + * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | + * ___________________________________________________ + */ +class CallArgMem +{ + public: + // pointer to buffer for storing function arguments + uint8_t *mem; + // size of function args + int funcArgsSizePerItem; + + template + int + getLaneOffset(int lane, int addr) + { + return addr * VSZ + sizeof(CType) * lane; + } + + CallArgMem(int func_args_size_per_item) + : funcArgsSizePerItem(func_args_size_per_item) + { + mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + } + + ~CallArgMem() + { + free(mem); + } + + template + uint8_t* + getLaneAddr(int lane, int addr) + { + return mem + getLaneOffset(lane, addr); + } + + template + void + setLaneAddr(int lane, int addr, CType val) + { + *((CType*)(mem + getLaneOffset(lane, addr))) = val; + } +}; + +/** + * A reconvergence stack entry conveys the necessary state to implement + * control flow divergence. 
+ */ +class ReconvergenceStackEntry { + + public: + ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, + VectorMask new_mask) : pc(new_pc), rpc(new_rpc), + execMask(new_mask) { + } + + /** + * PC of current instruction. + */ + uint32_t pc; + /** + * PC of the immediate post-dominator instruction, i.e., the value of + * @a pc for the first instruction that will be executed by the wavefront + * when a reconvergence point is reached. + */ + uint32_t rpc; + /** + * Execution mask. + */ + VectorMask execMask; +}; + +class Wavefront : public SimObject +{ + public: + enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; + enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; + + // Base pointer for array of instruction pointers + uint64_t base_ptr; + + uint32_t old_barrier_cnt; + uint32_t barrier_cnt; + uint32_t barrier_id; + uint32_t barrier_slots; + status_e status; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + int kern_id; + // SIMD unit where the WV has been scheduled + int simdId; + // pointer to parent CU + ComputeUnit *computeUnit; + + std::deque instructionBuffer; + + bool pendingFetch; + bool dropFetch; + + // Condition Register State (for HSAIL simulations only) + class ConditionRegisterState *condRegState; + // number of single precision VGPRs required by WF + uint32_t maxSpVgprs; + // number of double precision VGPRs required by WF + uint32_t maxDpVgprs; + // map virtual to physical vector register + uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + bool isGmInstruction(GPUDynInstPtr ii); + bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstGMem(); + bool isOldestInstLMem(); + bool isOldestInstPrivMem(); + bool isOldestInstFlatMem(); + bool isOldestInstALU(); + bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU + uint64_t last_addr[VSZ]; + uint32_t workitemid[3][VSZ]; + uint32_t workitemFlatId[VSZ]; + 
uint32_t workgroupid[3]; + uint32_t workgroupsz[3]; + uint32_t gridsz[3]; + uint32_t wg_id; + uint32_t wg_sz; + uint32_t dynwaveid; + uint32_t maxdynwaveid; + uint32_t dispatchid; + // outstanding global+local memory requests + uint32_t outstanding_reqs; + // memory requests between scoreboard + // and execute stage not yet executed + uint32_t mem_reqs_in_pipe; + // outstanding global memory write requests + uint32_t outstanding_reqs_wr_gm; + // outstanding local memory write requests + uint32_t outstanding_reqs_wr_lm; + // outstanding global memory read requests + uint32_t outstanding_reqs_rd_gm; + // outstanding local memory read requests + uint32_t outstanding_reqs_rd_lm; + uint32_t rd_lm_reqs_in_pipe; + uint32_t rd_gm_reqs_in_pipe; + uint32_t wr_lm_reqs_in_pipe; + uint32_t wr_gm_reqs_in_pipe; + + int mem_trace_busy; + uint64_t last_trace; + // number of vector registers reserved by WF + int reservedVectorRegs; + // Index into the Vector Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startVgprIndex; + + // Old value of destination gpr (for trace) + uint32_t old_vgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_vgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_vgpr_tcnt; + + // Old value of destination gpr (for trace) + uint64_t old_dgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_dgpr_id; + // Tick count of last old_dgpr copy + uint64_t old_dgpr_tcnt; + + // Execution mask at wavefront start + VectorMask init_mask; + + // number of barriers this WF has joined + int bar_cnt[VSZ]; + int max_bar_cnt; + // Flag to stall a wave on barrier + bool stalledAtBarrier; + + // a pointer to the fraction of the LDS allocated + // to this workgroup (thus this wavefront) + LdsChunk *ldsChunk; + + // A pointer to the spill area + Addr spillBase; + // The size of the spill area + uint32_t spillSizePerItem; + // The vector width of the spill area + uint32_t spillWidth; + + // A 
pointer to the private memory area + Addr privBase; + // The size of the private memory area + uint32_t privSizePerItem; + + // A pointer to the read-only memory area + Addr roBase; + // size of the read-only memory area + uint32_t roSize; + + // pointer to buffer for storing kernel arguments + uint8_t *kernelArgs; + // unique WF id over all WFs executed across all CUs + uint64_t wfDynId; + + // number of times instruction issue for this wavefront is blocked + // due to VRF port availability + Stats::Scalar numTimesBlockedDueVrfPortAvail; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueWAXDependencies; + // number of times an instruction of a WF is blocked from being issued + // due to RAW dependencies + Stats::Scalar numTimesBlockedDueRAWDependencies; + // distribution of executed instructions based on their register + // operands; this is used to highlight the load on the VRF + Stats::Distribution srcRegOpDist; + Stats::Distribution dstRegOpDist; + + // Functions to operate on call argument memory + // argument memory for hsail call instruction + CallArgMem *callArgMem; + void + initCallArgMem(int func_args_size_per_item) + { + callArgMem = new CallArgMem(func_args_size_per_item); + } + + template + CType + readCallArgMem(int lane, int addr) + { + return *((CType*)(callArgMem->getLaneAddr(lane, addr))); + } + + template + void + writeCallArgMem(int lane, int addr, CType val) + { + callArgMem->setLaneAddr(lane, addr, val); + } + + typedef WavefrontParams Params; + Wavefront(const Params *p); + ~Wavefront(); + virtual void init(); + + void + setParent(ComputeUnit *cu) + { + computeUnit = cu; + } + + void start(uint64_t _wfDynId, uint64_t _base_ptr); + + void exec(); + void updateResources(); + int ready(itype_e type); + bool instructionBufferHasBranch(); + void regStats(); + VectorMask get_pred() { return execMask() & init_mask; } + + bool waitingAtBarrier(int 
lane); + + void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& exec_mask); + + void popFromReconvergenceStack(); + + uint32_t pc() const; + + uint32_t rpc() const; + + VectorMask execMask() const; + + bool execMask(int lane) const; + + void pc(uint32_t new_pc); + + void discardFetch(); + + private: + /** + * Stack containing Control Flow Graph nodes (i.e., kernel instructions) + * to be visited by the wavefront, and the associated execution masks. The + * reconvergence stack grows every time the wavefront reaches a divergence + * point (branch instruction), and shrinks every time the wavefront + * reaches a reconvergence point (immediate post-dominator instruction). + */ + std::stack> reconvergenceStack; +}; + +#endif // __WAVEFRONT_HH__ diff --git a/src/mem/protocol/GPU_RfO-SQC.sm b/src/mem/protocol/GPU_RfO-SQC.sm new file mode 100644 index 000000000..1e5f8df74 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-SQC.sm @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + + I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet"; + S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB"; + 
} + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + + //TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + TCC_NackWB, desc="TCC Nack for WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return M data"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" 
{ + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == 
RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + out_port(responseNetwork_out, ResponseMsg, responseFromSQC); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if 
(responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else { + error("SQC should not receive TDSysResp other than CoherenceState:Shared"); + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime 
:= curCycle(); + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.InitialRequestTime := curCycle(); + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + 
in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} { + a_allocate; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(S, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_S, S_I, I_C}, {Fetch, Repl}) {} { + zz_recycleMandatoryQueue; + } + + transition(S, Repl, S_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + // TCC event + transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_I, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + 
pr_popResponseQueue; + } + + transition(S_I, TCC_AckWB, I) {TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_AckWB, I){TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + // Probe transitions + transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S}, PrbShrData, S) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C){ + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_S, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition(I_S, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(S_I, PrbInvData, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } +} diff --git a/src/mem/protocol/GPU_RfO-TCC.sm b/src/mem/protocol/GPU_RfO-TCC.sm new file mode 100644 index 000000000..cfddb3f00 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCC.sm @@ -0,0 +1,1199 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + WireBuffer * w_reqToTCCDir; + WireBuffer * w_respToTCCDir; + WireBuffer * w_TCCUnblockToTCCDir; + WireBuffer * w_reqToTCC; + WireBuffer * w_probeToTCC; + WireBuffer * w_respToTCC; + int TCC_select_num_bits; + Cycles l2_request_latency := 1; + Cycles l2_response_latency := 20; + + // To the general response network + MessageBuffer * responseFromTCC, network="To", virtual_network="3", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToTCC, network="From", virtual_network="3", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L1 Victim (dirty)"; + WrVicBlkShared, desc="L1 Victim (dirty)"; + ClVicBlk, desc="L1 Victim (clean)"; + ClVicBlkShared, desc="L1 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="Stale WB, No data"; + + L2_Repl, desc="L2 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory + I, AccessPermission:Invalid, desc="Invalid"; + + I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M"; + S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E"; + S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S"; + E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data"; + O_M, AccessPermission:Busy, desc="..."; + O_O, AccessPermission:Busy, desc="..."; + O_E, AccessPermission:Busy, desc="..."; + M_M, AccessPermission:Busy, desc="..."; + M_O, AccessPermission:Busy, desc="..."; + M_E, AccessPermission:Busy, desc="..."; + M_S, AccessPermission:Busy, desc="..."; + D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive"; + MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem"; + MO_I, AccessPermission:Busy, desc="M or O, received L2_Repl, waiting for WBAck from Mem"; + ES_I, AccessPermission:Busy, desc="E or S, received L2_Repl, waiting for WBAck from Mem"; + I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so 
nothing gets confused"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := 
state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return 
L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + + // OUT PORTS + out_port(w_requestNetwork_out, CPURequestMsg, w_reqToTCCDir); + out_port(w_TCCResp_out, ResponseMsg, w_respToTCCDir); + out_port(responseNetwork_out, ResponseMsg, responseFromTCC); + out_port(w_unblockNetwork_out, UnblockMsg, w_TCCUnblockToTCCDir); + + // IN PORTS + in_port(TDResponse_in, ResponseMsg, w_respToTCC) { + if (TDResponse_in.isReady(clockEdge())) { + peek(TDResponse_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } + else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on TDResponse Type"); + } + } + } + } + + // Response Network + in_port(responseNetwork_in, ResponseMsg, responseToTCC) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUData) { + if (in_msg.NbReqShared) { + trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on TDResponse Type"); + } + } + } + } + + // probe network + in_port(probeNetwork_in, TDProbeRequestMsg, w_probeToTCC) { + if (probeNetwork_in.isReady(clockEdge())) { + 
peek(probeNetwork_in, TDProbeRequestMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.ReturnData) { + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } else { + error("Don't think I should get any of these"); + } + } + } + } + } + + // Request Network + in_port(requestNetwork_in, CPURequestMsg, w_reqToTCC) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + assert(in_msg.Destination.isElement(machineID)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), 
TBEs.lookup(victim)); + } + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToTD, "r", desc="Miss in L2, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", 
desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vc_vicClean, "vc", desc="Victimize Clean L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this 
always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := cache_entry.DataBlk; + //assert(cache_entry.Dirty); Not needed in TCC where TCC can supply clean data + out_msg.Dirty := cache_entry.Dirty; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + //assert(tbe.Dirty); + out_msg.Dirty := tbe.Dirty; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to TCC") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to TCC") { + 
peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uo_sendUnblockOwner, "uo", desc="state changed to E, M, or O, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := true; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(us_sendUnblockSharer, "us", desc="state changed to S , unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); 
+ out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(un_sendUnblockNotValid, "un", desc="state changed toI, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popTDResponseQueue, "pn", desc="pop TD response queue") { + TDResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}){TagArrayRead} { + // TCCdir already knows that the block is not here. This is to allocate and get the block. 
+ r_requestToTD; + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlk, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlk, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlkS, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlkS, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +// check + transition(M, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + + //check + transition(E, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + +// check + transition({I}, WrVicBlk, I_M){TagArrayRead} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) { + zz_recycleRequestQueue; + } + +//check + transition({I}, WrVicBlkShared, I_O) {TagArrayRead}{ + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition(S, WrVicBlkShared, S_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(S, WrVicBlk, S_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlk, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlkShared, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + 
+// a stale writeback (every transition in this group acks the victim with w_sendResponseWBAck and moves the line to a transient X_Y state)
+  transition(O, WrVicBlk, O_O){TagArrayRead} {  // O_O returns to O on CPUData/CPUDataShared or on StaleWB
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(O, WrVicBlkShared, O_O){TagArrayRead} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(M, WrVicBlk, M_M){TagArrayRead} {  // M_M returns to M on StaleWB
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(M, WrVicBlkShared, M_O){TagArrayRead} {  // M_O returns to M on StaleWB
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+//check (I-state case: also runs a_allocateBlock, which the transitions from S/E/O/M in this group omit)
+  transition({I}, ClVicBlk, I_E){TagArrayRead} {  // I_E: CPUData -> E, CPUDataShared -> S, StaleWB -> I
+    t_allocateTBE;
+    f_setFrom;
+    a_allocateBlock;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition({I}, ClVicBlkShared, I_S){TagArrayRead} {  // I_S: CPUData/CPUDataShared -> S, StaleWB -> I
+    t_allocateTBE;
+    f_setFrom;
+    a_allocateBlock;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+//check
+  transition(S, ClVicBlkShared, S_S){TagArrayRead} {  // S_S: CPUData/CPUDataShared -> S, StaleWB -> S
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(E, ClVicBlk, E_E){TagArrayRead} {  // E_E returns to E on StaleWB
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(E, ClVicBlkShared, E_S){TagArrayRead} {  // E_S returns to E on StaleWB
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// a stale writeback
+  transition(O, ClVicBlk, O_O){TagArrayRead} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+// check. Original L3 had it going from O to O_S. Something can go from O to S only on writeback.
+ transition(O, ClVicBlkShared, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlk, M_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlkShared, M_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + r_requestToTD; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
+ pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
+ pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + + transition({O,S,I}, CPUData) { + pr_popResponseQueue; + } + + transition({M, O}, L2_Repl, MO_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_vicDirty; + i_invL2; + } + + transition({E, S,}, L2_Repl, ES_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vc_vicClean; + i_invL2; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, M_M, M_O, M_E, M_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition(I, PrbInvData){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL2; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(I, PrbShrData){TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({ES_I,MO_I}, PrbInv, I_C) { + pi_sendProbeResponseInv; + 
pp_popProbeQueue; + } + + transition({ES_I, MO_I}, PrbShrData) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MOD_I, WBAck, D_I) { + pn_popTDResponseQueue; + } + + transition(MO_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + // this can only be a spurious CPUData from a shared block. + transition(MO_I, CPUData) { + pr_popResponseQueue; + } + + transition(ES_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition(I_C, {WBAck}, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, StaleWB, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + i_invL2; + pr_popResponseQueue; + } + + transition({S_S, S_O, S_M, S_E}, StaleWB, S){TagArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({E_M, E_O, E_E, E_S}, StaleWB, E){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({O_M, O_O, O_E}, StaleWB, O){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M_M, M_O, M_E, M_S}, StaleWB, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(D_I, StaleWB, I) {TagArrayWrite}{ + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, StaleWB, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCCdir.sm b/src/mem/protocol/GPU_RfO-TCCdir.sm new file mode 100644 index 000000000..8f58d6ebb --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCCdir.sm @@ -0,0 +1,2672 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. 
+ * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Mithuna Thottethodi
+ */
+
+machine(MachineType:TCCdir, "AMD read-for-ownership directory for TCC (aka GPU L2)")
+: CacheMemory * directory;  // cache of directory entries; a full set triggers Event:Recall on a victim
+  // Convention: wire buffers are prefixed with "w_" for clarity
+  WireBuffer * w_reqToTCCDir;  // read by in_port w_TCCRequest_in (TCC victims and WrCancel)
+  WireBuffer * w_respToTCCDir;  // read by in_port w_TCCResponse_in (TCC probe responses)
+  WireBuffer * w_TCCUnblockToTCCDir;  // read by in_port w_TCCUnblock_in
+  WireBuffer * w_reqToTCC;
+  WireBuffer * w_probeToTCC;
+  WireBuffer * w_respToTCC;
+  int TCC_select_num_bits;
+  Cycles response_latency := 5;
+  Cycles directory_latency := 6;
+  Cycles issue_latency := 120;  // enqueue latency of the RdBlk/RdBlkS/RdBlkM requests sent to the NB
+
+  // From the TCPs or SQCs
+  MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
+  MessageBuffer * responseFromTCP, network="From", virtual_network="3", vnet_type="response";
+  MessageBuffer * unblockFromTCP, network="From", virtual_network="5", vnet_type="unblock";
+
+  // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+  MessageBuffer * probeToCore, network="To", virtual_network="1", vnet_type="request";
+  MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
+
+  // From the NB
+  MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
+  // To the NB
+  MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
+  MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
+
+  MessageBuffer * triggerQueue, random="false";  // local queue carrying AcksComplete triggers
+{
+  // STATES
+  state_declaration(State, desc="Directory states", default="TCCdir_State_I") {
+    // Base states (setState asserts the Owner/Sharers counts that go with S, E, O and M)
+    I, AccessPermission:Invalid, desc="Invalid";
+    S, AccessPermission:Invalid, desc="Shared";
+    E, AccessPermission:Invalid, desc="Exclusive";
+    O, AccessPermission:Invalid, desc="Owner";
+    M, AccessPermission:Invalid, desc="Modified";
+
+    CP_I, 
AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to invalid"; + B_I, AccessPermission:Invalid, desc="Blocked, need not send data after acks are in, going to invalid"; + CP_O, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to owned"; + CP_S, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to shared"; + CP_OM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to O_M"; + CP_SM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to S_M"; + CP_ISM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_IOM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_OSIW, AccessPermission:Invalid, desc="Blocked, must send data after acks+CancelWB are in, going to I_C"; + + + // Transient states and busy states used for handling side (TCC-facing) interactions + BW_S, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_E, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_O, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_M, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + + // Transient states and busy states used for handling upward (TCP-facing) interactions + I_M, AccessPermission:Invalid, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Invalid, desc="Invalid, issued RdBlk, have not seen response yet"; + I_S, AccessPermission:Invalid, desc="Invalid, issued RdBlkS, have not seen response yet"; + BBS_S, AccessPermission:Invalid, desc="Blocked, going from S to S"; + BBO_O, AccessPermission:Invalid, desc="Blocked, going from O to O"; + BBM_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for data to forward"; + BBM_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for data to forward"; + BB_M, 
AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for unblock"; + BB_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for unblock"; + BB_OO, AccessPermission:Invalid, desc="Blocked, going from O to O (adding sharers), waiting for unblock"; + BB_S, AccessPermission:Invalid, desc="Blocked, going to S, waiting for (possible multiple) unblock(s)"; + BBS_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBO_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBS_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + BBO_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + S_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + O_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + + // + BBB_S, AccessPermission:Invalid, desc="Blocked, going to S after core unblock"; + BBB_M, AccessPermission:Invalid, desc="Blocked, going to M after core unblock"; + BBB_E, AccessPermission:Invalid, desc="Blocked, going to E after core unblock"; + + VES_I, AccessPermission:Invalid, desc="TCC replacement, waiting for clean WB ack"; + VM_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_S, AccessPermission:Invalid, desc="TCC owner replacement, waiting for dirty WB ack"; + + ES_I, AccessPermission:Invalid, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Invalid, desc="L1 replacement, waiting for dirty WB ack"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB for canceled WB"; + I_W, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB; canceled WB raced with directory invalidation"; + + // Recall States + BRWD_I, AccessPermission:Invalid, desc="Recalling, waiting for WBAck and Probe Data responses"; + 
BRW_I, AccessPermission:Read_Write, desc="Recalling, waiting for WBAck";  // NOTE(review): the only TCCdir state declared Read_Write; every other state is Invalid -- confirm this is intended
+    BRD_I, AccessPermission:Invalid, desc="Recalling, waiting for Probe Data responses";
+
+  }
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {  // consumed by recordRequestType() and checkResourceAvailable()
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+
+  // EVENTS
+  enumeration(Event, desc="TCC Directory Events") {
+    // Upward facing events (TCCdir w.r.t. TCP/SQC and TCC behaves like NBdir behaves with TCP/SQC and L3)
+
+    // Directory Recall
+    Recall, desc="directory cache is full";  // raised on the victim address when a core request finds no entry and no free way
+    // CPU requests
+    CPUWrite, desc="Initial req from core, sent to TCC";  // core VicDirty, or core VicClean that is not filtered into NoCPUWrite
+    NoCPUWrite, desc="Initial req from core, but non-exclusive clean data; can be discarded";  // core VicClean from a non-owner while Sharers+Owner count > 1
+    CPUWriteCancel, desc="Initial req from core, sent to TCC";  // core WrCancel
+
+    // Requests from the TCPs
+    RdBlk, desc="RdBlk event";
+    RdBlkM, desc="RdBlkM event";
+    RdBlkS, desc="RdBlkS event";
+    CtoD, desc="Change to Dirty request";
+
+    // TCC writebacks
+    VicDirty, desc="...";  // TCC VicDirty from the recorded owner while other sharers remain
+    VicDirtyLast, desc="...";  // TCC VicDirty from the recorded owner in M, or with an empty sharer list
+    VicClean, desc="...";  // TCC VicClean from the only sharer
+    NoVic, desc="...";  // TCC VicClean from one of several sharers
+    StaleVic, desc="...";  // TCC victim whose requestor is not the recorded owner/sharer, or no entry exists
+    CancelWB, desc="TCC got invalidating probe, canceled WB";
+
+    // Probe Responses from TCP/SQCs
+    CPUPrbResp, desc="Probe response from TCP/SQC";
+    TCCPrbResp, desc="Probe response from TCC";
+
+    ProbeAcksComplete, desc="All acks received";  // AcksComplete trigger with tbe.Upgrade == false
+    ProbeAcksCompleteReissue, desc="All acks received, changing CtoD to reissue";  // AcksComplete trigger with tbe.Upgrade == true
+
+    CoreUnblock, desc="unblock from TCP/SQC";
+    LastCoreUnblock, desc="Last unblock from TCP/SQC";  // chosen when cache_entry.WaitingUnblocks == 1
+    TCCUnblock, desc="unblock from TCC (current owner)";  // UnblockMsg.currentOwner set
+    TCCUnblock_Sharer, desc="unblock from TCC (a sharer, not owner)";  // currentOwner clear, valid set
+    TCCUnblock_NotValid,desc="unblock from TCC (not valid...caused by stale writebacks)";  // currentOwner and valid both clear
+
+    // Downward facing events
+
+    // NB initiated
+    NB_AckS, desc="NB Ack to TCC Request";
+    NB_AckE, desc="NB Ack to TCC Request";
+    NB_AckM, desc="NB 
Ack to TCC Request"; + NB_AckCtoD, desc="NB Ack to TCC Request"; + NB_AckWB, desc="NB Ack for clean WB"; + + + // Incoming Probes from NB + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + } + + + // TYPES + + // Entry for directory + structure(Entry, desc="...", interface='AbstractCacheEntry') { + State CacheState, desc="Cache state (Cache of directory entries)"; + DataBlock DataBlk, desc="data for the block"; + NetDest Sharers, desc="Sharers for this block"; + NetDest Owner, desc="Owner of this block"; + NetDest MergedSharers, desc="Read sharers who are merged on a request"; + int WaitingUnblocks, desc="Number of acks we're waiting for"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="DataBlk"; + bool Dirty, desc="Is the data dirty?"; + MachineID Requestor, desc="requestor"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID UntransferredOwner, desc = "Untransferred owner for an upgrade transaction"; + bool UntransferredOwnerExists, desc = "1 if Untransferred owner exists for an upgrade transaction"; + bool Cached, desc="data hit in Cache"; + bool Shared, desc="victim hit by shared probe"; + bool Upgrade, desc="An upgrade request in progress"; + bool CtoD, desc="Saved sysack info"; + CoherenceState CohState, desc="Saved sysack info"; + MessageSizeType MessageSize, desc="Saved sysack info"; + MachineID Sender, desc="sender"; + } + + structure(TBETable, external = "yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // ** OBJECTS ** + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + NetDest TCC_dir_subtree; + NetDest temp; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void 
set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + + + bool presentOrAvail(Addr addr) { + return directory.isTagPresent(addr) || directory.cacheAvail(addr); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", directory.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + assert(false); + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCCdir_State_to_permission(state)); + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCCdir_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCCdir_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + 
cache_entry.CacheState := state; + + if (state == State:S) { + assert(cache_entry.Owner.count() == 0); + } + + if (state == State:O) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.isSuperset(cache_entry.Owner) == false); + } + + if (state == State:M) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.count() == 0); + } + + if (state == State:E) { + assert(cache_entry.Owner.count() == 0); + assert(cache_entry.Sharers.count() == 1); + } + } + } + + + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + directory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + directory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + directory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + directory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + 
out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + + // Class 2: upward facing ports to GPU cores + out_port(probeToCore_out, TDProbeRequestMsg, probeToCore); + out_port(responseToCore_out, ResponseMsg, responseToCore); + + // Class 3: sideward facing ports (on "wirebuffer" links) to TCC + out_port(w_requestTCC_out, CPURequestMsg, w_reqToTCC); + out_port(w_probeTCC_out, NBProbeRequestMsg, w_probeToTCC); + out_port(w_respTCC_out, ResponseMsg, w_respToTCC); + + + // local trigger port + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // + // request queue going to NB + // + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=8) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + assert(is_valid(tbe)); + Entry cache_entry := getCacheEntry(in_msg.addr); + if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == false)) { + trigger(Event:ProbeAcksComplete, in_msg.addr, cache_entry, tbe); + } else if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == true)) { + trigger(Event:ProbeAcksCompleteReissue, in_msg.addr, cache_entry, tbe); + } + } + } + } + + // Unblock Networks (TCCdir can receive unblocks from TCC, TCPs) + // Port on first (of three) wire buffers from TCC + in_port(w_TCCUnblock_in, UnblockMsg, w_TCCUnblockToTCCDir, rank=7) { + if (w_TCCUnblock_in.isReady(clockEdge())) { + peek(w_TCCUnblock_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.currentOwner) { + trigger(Event:TCCUnblock, in_msg.addr, cache_entry, tbe); + } else if (in_msg.valid) { + trigger(Event:TCCUnblock_Sharer, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCCUnblock_NotValid, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromTCP, rank=6) { + if (unblockNetwork_in.isReady(clockEdge())) { + 
peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if(cache_entry.WaitingUnblocks == 1) { + trigger(Event:LastCoreUnblock, in_msg.addr, cache_entry, tbe); + } + else { + trigger(Event:CoreUnblock, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + //Responses from TCC, and Cores + // Port on second (of three) wire buffers from TCC + in_port(w_TCCResponse_in, ResponseMsg, w_respToTCCDir, rank=5) { + if (w_TCCResponse_in.isReady(clockEdge())) { + peek(w_TCCResponse_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:TCCPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromTCP, rank=4) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + // Port on third (of three) wire buffers from TCC + in_port(w_TCCRequest_in, CPURequestMsg, w_reqToTCCDir, rank=3) { + if(w_TCCRequest_in.isReady(clockEdge())) { + peek(w_TCCRequest_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WrCancel) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + // if modified, or owner with no other sharers + if ((cache_entry.CacheState == State:M) || (cache_entry.Sharers.count() == 0)) { + assert(cache_entry.Owner.count()==1); + trigger(Event:VicDirtyLast, in_msg.addr, cache_entry, tbe); + } else { + 
trigger(Event:VicDirty, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } else { + if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Sharers.isElement(in_msg.Requestor)) { + if (cache_entry.Sharers.count() == 1) { + // Last copy, victimize to L3 + trigger(Event:VicClean, in_msg.addr, cache_entry, tbe); + } else { + // Either not the last copy or stall. No need to victimmize + // remove sharer from sharer list + assert(cache_entry.Sharers.count() > 1); + trigger(Event:NoVic, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } + } + } + } + } + + in_port(responseFromNB_in, ResponseMsg, responseFromNB, rank=2) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:NB_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). 
+ + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB, rank=1) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:VicDirty) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if(is_valid(cache_entry) && (cache_entry.Sharers.count() + cache_entry.Owner.count() ) >1) { + trigger(Event:NoCPUWrite, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + 
trigger(Event:CPUWriteCancel, in_msg.addr, cache_entry, tbe); + } + } else { + // All requests require a directory entry + Addr victim := directory.cacheProbe(in_msg.addr); + trigger(Event:Recall, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + + + + // Actions + + //Downward facing actions + + action(c_clearOwner, "c", desc="Clear the owner field") { + cache_entry.Owner.clear(); + } + + action(rS_removeRequesterFromSharers, "rS", desc="Remove unblocker from sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(rT_removeTCCFromSharers, "rT", desc="Remove TCC from sharer list") { + peek(w_TCCRequest_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rO_removeOriginalRequestorFromSharers, "rO", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rC_removeCoreFromSharers, "rC", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rCo_removeCoreFromOwner, "rCo", desc="Remove replacing core from sharer list") { + // Note that under some cases this action will try to remove a stale owner + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Owner.remove(in_msg.Requestor); + } + } + + action(rR_removeResponderFromSharers, "rR", desc="Remove responder from sharer list") { + peek(responseNetwork_in, ResponseMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(nC_sendNullWBAckToCore, "nC", desc = "send a null WB Ack to release core") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBNack; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := 
in_msg.MessageSize; + } + } + } + + action(nT_sendNullWBAckToTCC, "nT", desc = "send a null WB Ack to release TCC") { + peek(w_TCCRequest_in, CPURequestMsg) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + + action(eto_moveExSharerToOwner, "eto", desc="move the current exclusive sharer to owner") { + assert(cache_entry.Sharers.count() == 1); + assert(cache_entry.Owner.count() == 0); + cache_entry.Owner := cache_entry.Sharers; + cache_entry.Sharers.clear(); + APPEND_TRANSITION_COMMENT(" new owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + + action(aT_addTCCToSharers, "aT", desc="Add TCC to sharer list") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(as_addToSharers, "as", desc="Add unblocker to sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(c_moveOwnerToSharer, "cc", desc="Move owner to sharers") { + cache_entry.Sharers.addNetDest(cache_entry.Owner); + cache_entry.Owner.clear(); + } + + action(cc_clearSharers, "\c", desc="Clear the sharers field") { + cache_entry.Sharers.clear(); + } + + action(e_ownerIsUnblocker, "e", desc="The owner is now the unblocker") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcp_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(eT_ownerIsUnblocker, "eT", desc="TCC (unblocker) is now owner") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcc_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(ctr_copyTCCResponseToTBE, "ctr", desc="Copy TCC probe response 
data to TBE") { + peek(w_TCCResponse_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. + if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.Sender := in_msg.Sender; + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(ccr_copyCoreResponseToTBE, "ccr", desc="Copy core probe response data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. + if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + + if(tbe.Sender == machineID) { + tbe.Sender := in_msg.Sender; + } + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(cd_clearDirtyBitTBE, "cd", desc="Clear Dirty bit in TBE") { + tbe.Dirty := false; + } + + action(n_issueRdBlk, "n-", desc="Issue RdBlk") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(rU_rememberUpgrade, "rU", desc="Remember that this was an upgrade") { + tbe.Upgrade := true; + } + + 
action(ruo_rememberUntransferredOwner, "ruo", desc="Remember the untransferred owner") { + peek(responseNetwork_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(ruoT_rememberUntransferredOwnerTCC, "ruoT", desc="Remember the untransferred owner") { + peek(w_TCCResponse_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := true; + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := false; + } + } + + + action(sT_sendRequestToTCC, "sT", desc="send request to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(w_requestTCC_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, 
TCC_select_num_bits)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + } + APPEND_TRANSITION_COMMENT(" requestor "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + + } + } + + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + + temp := cache_entry.Sharers; + temp.addNetDest(cache_entry.Owner); + if (temp.isElement(tcc)) { + temp.remove(tcc); + } + if (temp.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := temp; + tbe.NumPendingAcks := temp.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(ls2_probeShrL2Data, "ls2", desc="local probe downgrade L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(s2_probeShrL2Data, "s2", desc="probe shared L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData 
:= true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(ldc_probeInvCoreData, "ldc", desc="local probe to inv cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + peek(coreRequestNetwork_in, CPURequestMsg) { + NetDest dest:= cache_entry.Sharers; + dest.addNetDest(cache_entry.Owner); + if(dest.isElement(tcc)){ + dest.remove(tcc); + } + dest.remove(in_msg.Requestor); + tbe.NumPendingAcks := dest.count(); + if (dest.count()>0){ + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(dest); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ld2_probeInvL2Data, "ld2", desc="local probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores + TCC, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + 
out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(cache_entry.Sharers); + out_msg.Destination.addNetDest(cache_entry.Owner); + tbe.NumPendingAcks := cache_entry.Sharers.count() + cache_entry.Owner.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if (out_msg.Destination.isElement(tcc)) { + out_msg.Destination.remove(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + + action(d2_probeInvL2Data, "d2", desc="probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(lpc_probeInvCore, "lpc", desc="local probe inv cores, no data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if(TCC_dir_subtree.isElement(in_msg.Requestor)) { + TCC_dir_subtree.remove(in_msg.Requestor); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + if(TCC_dir_subtree.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + 
out_msg.localCtoD := true; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ipc_probeInvCore, "ipc", desc="probe inv cores, no data") { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(TCC_dir_subtree.count() > 0) { + + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(i2_probeInvL2, "i2", desc="probe inv L2, no data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := 
MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := 
address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(mc_cancelWB, "mc", desc="send writeback cancel to NB directory") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(sCS_sendCollectiveResponseS, "sCS", desc="send shared response to all merged TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.addNetDest(cache_entry.MergedSharers); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sS_sendResponseS, "sS", desc="send shared response to TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sM_sendResponseM, "sM", desc="send response to TCP/SQC") { + enqueue(responseToCore_out, 
ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Modified; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + + action(fw2_forwardWBAck, "fw2", desc="forward WBAck to TCC") { + peek(responseFromNB_in, ResponseMsg) { + if(tbe.OriginalRequestor != machineID) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + //out_msg.DataBlk := tbe.DataBlk; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + } + + action(sa_saveSysAck, "sa", desc="Save SysAck ") { + peek(responseFromNB_in, ResponseMsg) { + tbe.Dirty := in_msg.Dirty; + if (tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + } + else { + tbe.DataBlk := tbe.DataBlk; + } + tbe.CtoD := in_msg.CtoD; + tbe.CohState := in_msg.State; + tbe.Shared := in_msg.Shared; + tbe.MessageSize := in_msg.MessageSize; + } + } + + action(fsa_forwardSavedAck, "fsa", desc="forward saved SysAck to TCP or SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := tbe.DataBlk; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := tbe.CtoD; + out_msg.State := tbe.CohState; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.MessageSize := tbe.MessageSize; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := tbe.Sender; + } + } + + action(fa_forwardSysAck, "fa", desc="forward SysAck to TCP or SQC") { + peek(responseFromNB_in, ResponseMsg) { + 
enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := in_msg.DataBlk; + tbe.Sender := machineID; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := in_msg.CtoD; + out_msg.State := in_msg.State; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Dirty := in_msg.Dirty; + out_msg.Sender := tbe.Sender; + DPRINTF(RubySlicc, "%s\n", (out_msg.DataBlk)); + } + } + } + + action(pso_probeSharedDataOwner, "pso", desc="probe shared data at owner") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if (cache_entry.Owner.isElement(tcc)) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + else { // i.e., owner is a core + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.addNetDest(cache_entry.Owner); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + tbe.NumPendingAcks := 1; + } + + action(i_popIncomingRequestQueue, "i", desc="Pop incoming request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(j_popIncomingUnblockQueue, "j", desc="Pop incoming unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pk_popResponseQueue, "pk", desc="Pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop incoming probe queue") { + probeNetwork_in.dequeue(clockEdge()); 
+ } + + action(pR_popResponseFromNBQueue, "pR", desc="Pop incoming Response queue From NB") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pl_popTCCRequestQueue, "pl", desc="pop TCC request queue") { + w_TCCRequest_in.dequeue(clockEdge()); + } + + action(plr_popTCCResponseQueue, "plr", desc="pop TCC response queue") { + w_TCCResponse_in.dequeue(clockEdge()); + } + + action(plu_popTCCUnblockQueue, "plu", desc="pop TCC unblock queue") { + w_TCCUnblock_in.dequeue(clockEdge()); + } + + + action(m_addUnlockerToSharers, "m", desc="Add the unlocker to the sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + cache_entry.MergedSharers.remove(in_msg.Sender); + assert(cache_entry.WaitingUnblocks >= 0); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks - 1; + } + } + + action(q_addOutstandingMergedSharer, "q", desc="Increment outstanding requests") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.MergedSharers.add(in_msg.Requestor); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks + 1; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockToNB_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(zz_recycleRequest, "\z", desc="Recycle the request queue") { + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleTCCRequestQueue, "yy", desc="recycle yy request queue") { + w_TCCRequest_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xz_recycleResponseQueue, "xz", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + 
action(xx_recycleTCCResponseQueue, "xx", desc="recycle TCC response queue") {
+    // Recycle w_TCCResponse_in, passing the current clock edge and
+    // recycle_latency converted to ticks.
+    w_TCCResponse_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Recycle the TCC unblock queue (w_TCCUnblock_in).
+  action(vv_recycleTCCUnblockQueue, "vv", desc="Recycle the TCC unblock queue") {
+    w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Recycle the core unblock queue (unblockNetwork_in), i.e. the queue that
+  // j_popIncomingUnblockQueue pops. This action used to recycle
+  // w_TCCUnblock_in, which made it an exact duplicate of
+  // vv_recycleTCCUnblockQueue and left no way to recycle core unblocks.
+  // NOTE(review): confirm that every transition using xy_ is driven by an
+  // event raised from unblockNetwork_in.
+  action(xy_recycleUnblockQueue, "xy", desc="Recycle the core unblock queue") {
+    unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Recycle the probe queue coming from the NB (probeNetwork_in).
+  action(ww_recycleProbeRequest, "ww", desc="Recycle the probe request queue") {
+    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // One probe response has been consumed: decrement the TBE's pending count.
+  action(x_decrementAcks, "x", desc="decrement Acks pending") {
+    tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+  }
+
+  // When no acks remain outstanding, enqueue an AcksComplete trigger for this
+  // address; the remaining count is always appended to the transition comment.
+  action(o_checkForAckCompletion, "o", desc="check for ack completion") {
+    if (tbe.NumPendingAcks == 0) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:AcksComplete;
+      }
+    }
+    APPEND_TRANSITION_COMMENT(" tbe acks ");
+    APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+  }
+
+  // Allocate and initialise a TBE while an NB probe is at the head of
+  // probeNetwork_in. No field of the peeked message is copied into the TBE.
+  action(tp_allocateTBE, "tp", desc="allocate TBE Entry for upward transactions") {
+    check_allocate(TBEs);
+    peek(probeNetwork_in, NBProbeRequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      tbe.Dirty := false;
+      tbe.NumPendingAcks := 0;
+      tbe.UntransferredOwnerExists := false;
+    }
+  }
+
+  // Allocate a TBE for a request arriving from the TCC (w_TCCRequest_in),
+  // capturing the message's data block and requestor.
+  action(tv_allocateTBE, "tv", desc="allocate TBE Entry for TCC transactions") {
+    check_allocate(TBEs);
+    peek(w_TCCRequest_in, CPURequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      tbe.DataBlk := in_msg.DataBlk; // Data only for WBs
+      tbe.Dirty := false;
+      tbe.OriginalRequestor := in_msg.Requestor;
+      tbe.NumPendingAcks := 0;
+      tbe.UntransferredOwnerExists := false;
+    }
+  }
+
+  // Allocate a TBE for a core request (coreRequestNetwork_in). The data block
+  // is seeded from the directory entry, the requestor comes from the message,
+  // and Sender starts out as this controller's own machineID.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);//check whether resources are full
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+      tbe.Dirty := false;
+      tbe.Upgrade := false;
+      tbe.OriginalRequestor := in_msg.Requestor;
+      tbe.NumPendingAcks := 0;
+      tbe.UntransferredOwnerExists := false;
+      tbe.Sender := machineID;
+    }
+  }
+
+  // Allocate a TBE for a self-initiated recall: there is no incoming message,
+  // so this controller is recorded as the original requestor. Unlike
+  // t_allocateTBE, tbe.Sender is not assigned here.
+  action(tr_allocateTBE, "tr", desc="allocate TBE Entry for recall") {
+    check_allocate(TBEs);//check whether resources are full
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+    tbe.Dirty := false;
+    tbe.Upgrade := false;
+    tbe.OriginalRequestor := machineID; //Recall request, Self initiated
+    tbe.NumPendingAcks := 0;
+    tbe.UntransferredOwnerExists := false;
+  }
+
+  // Release the TBE for this address and clear the implicit tbe variable.
+  action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+
+  // Allocate a directory entry for this address unless one is already bound.
+  action(d_allocateDir, "d", desc="allocate Directory Cache") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(directory.allocate(address, new Entry));
+    }
+  }
+
+  // Free the directory entry if one is bound; the implicit cache_entry
+  // variable is cleared in either case.
+  action(dd_deallocateDir, "dd", desc="deallocate Directory Cache") {
+    if (is_valid(cache_entry)) {
+        directory.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Send a StaleNotif control response to the NB directory for this address.
+  action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:StaleNotif;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Sender := machineID;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Write the TBE's data block back to the NB directory as a CPUData
+  // response. NbReqShared mirrors tbe.Shared.
+  action(wb_data, "wb", desc="write back data") {
+    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUData;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Shared) {
+        out_msg.NbReqShared := true;
+      } else {
+        out_msg.NbReqShared := false;
+      }
+      out_msg.State := CoherenceState:Shared; // faux info
+      out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Mark the in-flight transaction as shared; requires a valid TBE.
+  action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+    assert(is_valid(tbe));
+    tbe.Shared := true;
+  }
+
+  // Merge a core probe response into the TBE. Data is taken when the TBE is
+  // still clean or the response is dirty, so dirty data already held is never
+  // replaced by clean data. A hit response sets tbe.Cached.
+  action(y_writeDataToTBE, "y", desc="write Probe Data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (!tbe.Dirty || in_msg.Dirty) {
+        tbe.DataBlk := in_msg.DataBlk;
+        tbe.Dirty := in_msg.Dirty;
+      }
+      if (in_msg.Hit) {
+        tbe.Cached := true;
+      }
+    }
+  }
+
+  // Same merge as y_writeDataToTBE, but for a response peeked from the TCC
+  // response queue (w_TCCResponse_in).
+  action(ty_writeTCCDataToTBE, "ty", desc="write TCC Probe Data to TBE") {
+    peek(w_TCCResponse_in, ResponseMsg) {
+      if (!tbe.Dirty || in_msg.Dirty) {
+        tbe.DataBlk := in_msg.DataBlk;
+        tbe.Dirty := in_msg.Dirty;
+      }
+      if (in_msg.Hit) {
+        tbe.Cached := true;
+      }
+    }
+  }
+
+
+  // Mark this address as most recently used in the directory.
+  action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+    directory.setMRU(address);
+  }
+
+  // TRANSITIONS
+
+  // Handling TCP/SQC requests (similar to how NB dir handles TCC events with some changes to account for stateful directory).
+ + + // transitions from base + transition(I, RdBlk, I_ES){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + n_issueRdBlk; + i_popIncomingRequestQueue; + } + + transition(I, RdBlkS, I_S){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nS_issueRdBlkS; + i_popIncomingRequestQueue; + } + + + transition(I_S, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckE, BBB_E) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition({S_M, O_M}, {NB_AckCtoD,NB_AckM}, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_M, NB_AckM, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(BBB_M, CoreUnblock, M){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + e_ownerIsUnblocker; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_S, CoreUnblock, S){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_E, CoreUnblock, E){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + + transition(I, RdBlkM, I_M){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nM_issueRdBlkM; + i_popIncomingRequestQueue; + } + + // + transition(S, {RdBlk, RdBlkS}, BBS_S){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBS_S, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Wait for probe acks to be complete + transition(BBS_S, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_S, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + 
o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBS_S, ProbeAcksComplete, BB_S) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_S, CoreUnblock, BB_S) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_S, LastCoreUnblock, S) { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(O, {RdBlk, RdBlkS}, BBO_O){TagArrayRead} { + t_allocateTBE; + pso_probeSharedDataOwner; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBO_O, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + + // Wait for probe acks to be complete + transition(BBO_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBO_O, ProbeAcksComplete, BB_OO) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_OO, CoreUnblock) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_OO, LastCoreUnblock, O){TagArrayWrite} { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(S, CPUWrite, BW_S){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(E, CPUWrite, BW_E){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(O, CPUWrite, BW_O){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + 
sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(M, CPUWrite, BW_M){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(BW_S, TCCUnblock_Sharer, S){TagArrayWrite} { + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_S, TCCUnblock_NotValid, S){TagArrayWrite} { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock, E){TagArrayWrite} { + cc_clearSharers; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock_NotValid, E) { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock, M) { + c_clearOwner; + cc_clearSharers; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock_NotValid, M) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock, O) { + c_clearOwner; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock_NotValid, O) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + // We lost the owner, likely due to an invalidation racing with an 'O' wb + transition(BW_O, TCCUnblock_Sharer, S) { + c_clearOwner; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition({BW_M, BW_S, BW_E, BW_O}, {PrbInv,PrbInvData,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition(BRWD_I, {PrbInvData, PrbInv, PrbShrData}) { + ww_recycleProbeRequest; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(S, CtoD, BBS_UM) {TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + 
transition(BBS_UM, CPUPrbResp, BBS_UM) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_UM, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_UM, ProbeAcksComplete, S_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(O, CtoD, BBO_UM){TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_UM, CPUPrbResp, BBO_UM) { + ruo_rememberUntransferredOwner; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_UM, TCCPrbResp) { + ruoT_rememberUntransferredOwnerTCC; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_UM, ProbeAcksComplete, O_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition({S,E}, RdBlkM, BBS_M){TagArrayWrite} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBS_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_M, ProbeAcksComplete, S_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition(O, RdBlkM, BBO_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + 
x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_M, ProbeAcksComplete, O_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // + transition(M, RdBlkM, BBM_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + // TCP recalled block before receiving probe + transition({BBM_M, BBS_M, BBO_M}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + transition(BBM_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBM_M, ProbeAcksComplete, BB_M) { + sM_sendResponseM; + pt_popTriggerQueue; + } + + transition(BB_M, CoreUnblock, M){TagArrayWrite} { + e_ownerIsUnblocker; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(M, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(E, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} { + t_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + transition(BBM_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(BBM_O, ProbeAcksComplete, BB_O) { + sS_sendResponseS; + pt_popTriggerQueue; + } + + transition(BB_O, CoreUnblock, O){TagArrayWrite} { + as_addToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition({BBO_O, BBM_M, BBS_S, BBM_O, BB_M, BB_O, BB_S, BBO_UM, BBS_UM, BBS_M, BBO_M, BB_OO}, {PrbInvData, PrbInv,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition({BBM_O, BBS_S, CP_S, CP_O, CP_SM, 
CP_OM, BBO_O}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + // stale CtoD raced with external invalidation + transition({I, CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, CtoD) { + i_popIncomingRequestQueue; + } + + // stale CtoD raced with internal RdBlkM + transition({BBM_M, BBS_M, BBO_M, BBB_M, BBS_UM, BBO_UM}, CtoD) { + i_popIncomingRequestQueue; + } + + transition({E, M}, CtoD) { + i_popIncomingRequestQueue; + } + + + // TCC-directory has sent out (And potentially received acks for) probes. + // TCP/SQC replacement (known to be stale subsequent) are popped off. + transition({BBO_UM, BBS_UM}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition(S_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition(O_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + + transition({BBM_M, BBS_M, BBO_M, BBO_UM, BBS_UM}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_S, CP_O, CP_OM, CP_SM}, {VicDirty, VicClean, VicDirtyLast, CancelWB, NoVic}) { + yy_recycleTCCRequestQueue; + } + + // However, when TCCdir has sent out PrbSharedData, one cannot ignore. + transition({BBS_S, BBO_O, BBM_O, S_M, O_M, BBB_M, BBB_S, BBB_E}, {VicDirty, VicClean, VicDirtyLast,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, CancelWB) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + + /// recycle if waiting for unblocks. + transition({BB_M,BB_O,BB_S,BB_OO}, {VicDirty, VicClean, VicDirtyLast,NoVic,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BBS_S, BBO_O}, NoVic) { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // stale. Pop message and send dummy ack. 
+ transition({I_S, I_ES, I_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition(M, VicDirtyLast, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(E, VicDirty, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, VicDirty, VO_S){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, {VicDirtyLast, VicClean}, VO_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition({E, S}, VicClean, VES_I){TagArrayRead} { + tv_allocateTBE; + vc_victim; + pl_popTCCRequestQueue; + } + + transition({O, S}, NoVic){TagArrayRead} { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({O,S}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({M,E}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + // This can only happen if there is a race. (TCCdir sent out probes which caused this cancel in the first place.) 
+ transition({VM_I, VES_I, VO_I}, CancelWB) { + pl_popTCCRequestQueue; + } + + transition({VM_I, VES_I, VO_I}, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(VO_S, NB_AckWB, S){TagArrayWrite} { + c_clearOwner; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + pR_popResponseFromNBQueue; + } + + transition(I_C, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + ss_sendStaleNotification; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(I_W, NB_AckWB, I) { + ss_sendStaleNotification; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + + + // Do not handle replacements, reads of any kind or writebacks from transients; recycle + transition({I_M, I_ES, I_S, MO_I, ES_I, S_M, O_M, VES_I, VO_I, VO_S, VM_I, I_C, I_W}, {RdBlkS,RdBlkM,RdBlk,CtoD}) { + zz_recycleRequest; + } + + transition( VO_S, NoCPUWrite) { + zz_recycleRequest; + } + + transition({BW_M, BW_S, BW_O, BW_E}, {RdBlkS,RdBlkM,RdBlk,CtoD,NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_M, BBB_S, BBB_E, BB_O, BB_M, BB_S, BB_OO}, { RdBlk, RdBlkS, RdBlkM, CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_S, BBB_E, BB_O, BB_S, BB_OO}, { CtoD}) { + zz_recycleRequest; + } + + transition({BBS_UM, BBO_UM, BBM_M, BBM_O, BBS_M, BBO_M}, { RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition(BBM_O, CtoD) { + zz_recycleRequest; + } + + transition({BBS_S, BBO_O}, {RdBlkM, CtoD}) { + zz_recycleRequest; + } + + transition({B_I, CP_I, CP_S, CP_O, CP_OM, CP_SM, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition({CP_O, CP_S, CP_OM}, CtoD) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probe got in. 
+ transition({CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probes processed + transition({I, I_S, I_ES, I_M, I_C, I_W}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + // cannot ignore cancel... otherwise TCP/SQC will be stuck in I_C + transition({I, I_S, I_ES, I_M, I_C, I_W, S_M, M, O, E, S}, CPUWriteCancel){TagArrayRead} { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, {NoVic, VicClean, VicDirty, VicDirtyLast}){ + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Handling Probes from NB (General process: (1) propagate up, go to blocking state (2) process acks (3) on last ack downward.) + + // step 1 + transition({M, O, E, S}, PrbInvData, CP_I){TagArrayRead} { + tp_allocateTBE; + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_I, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_I, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_I, ProbeAcksComplete, I){TagArrayWrite} { + pd_sendProbeResponseData; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({M, O, E, S}, PrbInv, B_I){TagArrayWrite} { + tp_allocateTBE; + ipc_probeInvCore; + i2_probeInvL2; + pp_popProbeQueue; + } + // step 2 + transition(B_I, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(B_I, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(B_I, ProbeAcksComplete, I){TagArrayWrite} { + // send response down to NB + pi_sendProbeResponseInv; + 
c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + + // step 1 + transition({M, O}, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + + transition(E, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_O, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_O, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_O, ProbeAcksComplete, O){TagArrayWrite} { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + //step 1 + transition(S, PrbShrData, CP_S) { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_S, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_S, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_S, ProbeAcksComplete, S) { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + // step 1 + transition(O_M, PrbInvData, CP_IOM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_IOM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_IOM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_IOM, ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + 
cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition(CP_IOM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition(S_M, PrbInvData, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2a + transition(CP_ISM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_ISM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_ISM, ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + cd_clearDirtyBitTBE; + + //dt_deallocateTBE; + pt_popTriggerQueue; + } + transition(CP_ISM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pim_sendProbeResponseInvMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({S_M, O_M}, {PrbInv}, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // next steps inherited from BS_ISM + + // Simpler cases + + transition({I_C, I_W}, {PrbInvData, PrbInv, PrbShrData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + //If the directory is certain that the block is not present, one can send an acknowledgement right away. + // No need for three step process. 
+ transition(I, {PrbInv,PrbShrData,PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, {PrbInv, PrbInvData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, PrbShrData) { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + //step 1 + transition(S_M, PrbShrData, CP_SM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2 + transition(CP_SM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_SM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_SM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, S_M){DataArrayRead} { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + //step 1 + transition(O_M, PrbShrData, CP_OM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_OM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_OM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_OM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, O_M) { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + transition(BRW_I, PrbInvData, I_W) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({VM_I,VO_I}, PrbInvData, I_C) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(VES_I, {PrbInvData,PrbInv}, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({VM_I, VO_I, BRW_I}, PrbInv, I_W) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({VM_I, VO_I, VO_S, VES_I, BRW_I}, PrbShrData) { + 
pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(VO_S, PrbInvData, CP_OSIW) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + + transition(CP_OSIW, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(CP_OSIW, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(CP_OSIW, ProbeAcksComplete, I_C) { + pd_sendProbeResponseData; + cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition({I, S, E, O, M, CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Recall Transitions + // transient states still require the directory state + transition({M, O}, Recall, BRWD_I) { + tr_allocateTBE; + vd_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition({E, S}, Recall, BRWD_I) { + tr_allocateTBE; + vc_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition(I, Recall) { + dd_deallocateDir; + } + + transition({BRWD_I, BRD_I}, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition({BRWD_I, BRD_I}, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BRWD_I, NB_AckWB, BRD_I) { + pR_popResponseFromNBQueue; + } + + transition(BRWD_I, ProbeAcksComplete, BRW_I) { + pt_popTriggerQueue; + } + + transition(BRW_I, NB_AckWB, I) { + wb_data; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(BRD_I, ProbeAcksComplete, I) { + wb_data; + 
dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // wait for stable state for Recall + transition({BRWD_I,BRD_I,BRW_I,CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W, CP_I}, Recall) { + zz_recycleRequest; // stall and wait would be for the wrong address + ut_updateTag; // try to find an easier recall + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCP.sm b/src/mem/protocol/GPU_RfO-TCP.sm new file mode 100644 index 000000000..6cf9224a6 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCP.sm @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : GPUCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E, AccessPermission:Read_Write, desc="Exclusive"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + M, AccessPermission:Read_Write, desc="Modified"; + + I_M, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + S_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response 
yet"; + O_M, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet"; + + ES_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for dirty WB ack"; + + MO_PI, AccessPermission:Read_Only, desc="L1 downgrade, waiting for CtoD ack (or ProbeInvalidateData)"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCC for canceled WB"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store"; + + // TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckE, desc="TCC Ack to Core Request"; + TCC_AckM, desc="TCC Ack to Core Request"; + TCC_AckCtoD, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + TCC_NackWB, desc="TCC Nack for WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInv, desc="probe, no need for data"; + LocalPrbInv, desc="local probe, no need for data"; + PrbShrData, desc="probe downgrade, return O or M data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data 
messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + 
} + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + 
MachineType getCoherenceType(MachineID myMachID, + MachineID senderMachID) { + if(myMachID == senderMachID) { + return MachineType:TCP; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCP) { + return MachineType:L1Cache_wCC; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCC) { + return MachineType:TCC; + } else { + return MachineType:TCCdir; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + out_port(responseNetwork_out, ResponseMsg, responseFromTCP); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToTCP) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + if(in_msg.localCtoD) { + trigger(Event:LocalPrbInv, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:TCC_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCC_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == 
CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:TCC_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + /* Core request port: LD triggers Load and every other request type triggers Store, provided the line is already cached or L1cache.cacheAvail reports room; otherwise Repl is triggered on the victim address returned by cacheProbe. */ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + /* Deallocate the L1cache entry for address when one exists, then clear cache_entry in either case. */ action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + /* Enqueue a RdBlk request for address, destined for the TCCdir chosen by mapAddressToRange. */ action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + 
out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + /* Same message as the RdBlk request action except that Type is RdBlkM. */ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + /* Send a VicDirty request carrying the entry's DataBlk and Dirty bit; Shared is set only when the entry is in state O. */ action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := cache_entry.Dirty; + } + } + + /* Send a VicClean request without data; Shared is set only when the entry is in state S. NOTE(review): unlike vd_victim, cache_entry.CacheState is read here without a preceding assert(is_valid(cache_entry)) - confirm every caller guarantees a valid entry. */ action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + /* Allocate a new L1cache Entry for address only when no valid cache_entry is bound yet. */ action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", 
desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + /* Release the TBE for address and clear the bound tbe. */ action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + /* Local hit: pass cache_entry.DataBlk to sequencer.readCallback when use_seq_not_coal is set, otherwise to coalescer.readCallback, reporting MachineType:TCP in both cases. */ action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.readCallback(address, MachineType:TCP, cache_entry.DataBlk); + } + } + + /* Load completed by a response: same sequencer/coalescer choice, but the callback also receives the sender's machine type and the three request timestamps taken from the peeked response. */ action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPReadCallBack(machineID, in_msg.Sender); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.readCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + } + + /* Local store: issue the write callback through the sequencer or the coalescer, then mark the entry dirty. */ action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, machineID); + sequencer.writeCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.writeCallback(address, MachineType:TCP, 
cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + /* Store completed by a response: write callback through the sequencer or the coalescer with the sender's machine type and the request timestamps from the peeked response, then mark the entry dirty. */ action(xs_storeDone, "xs", desc="remote store done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, in_msg.Sender); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.writeCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + cache_entry.Dirty := true; + } + } + + /* Copy DataBlk and Dirty from the peeked response into the cache entry. */ action(w_writeCache, "w", desc="write data to cache") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + /* Send a StaleNotif control response (no data) to the TCC chosen by mapAddressToRange. */ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + /* Send the TBE's copy of the block (DataBlk, Dirty, and Shared reported as NbReqShared) to the TCC as a CPUData response. */ action(wb_data, "wb", desc="write back data") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; 
+ } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + /* Data-less CPUPrbResp to the TCCdir with Hit false, Ntsl true and UntransferredOwner true; this variant does not set isValid. */ action(piu_sendProbeResponseInvUntransferredOwnership, "piu", desc="send probe ack inv, no data, retain ownership") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.UntransferredOwner :=true; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + /* Data-less CPUPrbResp to the TCCdir with Hit false and Ntsl true; isValid is filled from isValid(address). */ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + /* Sets the same fields to the same values as pi_sendProbeResponseInv; used by the S_M/O_M probe transitions. */ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := 
false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + /* Data-less miss response: CPUPrbResp to the TCCdir with Hit false and Ntsl false. */ action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data, I think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + /* Probe hit with data: DataBlk comes from getDataBlock(address) and Dirty from the TBE when one exists, otherwise from the cache entry. */ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + } + } + + /* Like pd_sendProbeResponseData but additionally requires a valid cache entry. NOTE(review): the second assert makes the first one redundant. */ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := 
CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + DPRINTF(RubySlicc, "Data is %s\n", out_msg.DataBlk); + } + } + + /* Record in the TBE that a shared probe hit the victim; the writeback action reads tbe.Shared to fill NbReqShared. */ action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + + /* Send an Unblock_Control message to the TCCdir, with wasValid taken from isValid(address). */ action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.wasValid := isValid(address); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + /* Re-queue the head probe after recycle_latency cycles instead of consuming it. */ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + /* Re-queue the head core request after recycle_latency cycles instead of consuming it. */ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + /* Load miss: allocate the line, request it with RdBlk, and wait in I_ES. */ transition(I, Load, I_ES) {TagArrayRead} { + a_allocate; + n_issueRdBlk; + p_popMandatoryQueue; + } + + /* Store miss: allocate the line, request it with RdBlkM, and wait in I_M. */ transition(I, Store, I_M) {TagArrayRead, TagArrayWrite} { + a_allocate; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, Store, S_M) {TagArrayRead} { + mru_updateMRU; + nM_issueRdBlkM; 
+ p_popMandatoryQueue; + } + + /* Store to E completes locally (no request is issued) and moves to M. */ transition(E, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + /* Store to O issues RdBlkM and waits in O_M. */ transition(O, Store, O_M) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(M, Store) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + // simple hit transitions + transition({S, E, O, M}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_M, I_ES, ES_I, MO_I, S_M, O_M, MO_PI, I_C}, {Load, Store, Repl}) {} { + zz_recycleMandatoryQueue; + } + + /* Clean eviction. The mandatory queue is not popped here, so the request that caused Repl stays at the head - presumably to be retried once a way is free. */ transition({S, E}, Repl, ES_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + /* Dirty eviction; as with the clean case, the triggering request is left queued. */ transition({O, M}, Repl, MO_I) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_victim; + ic_invCache; + } + + // TD event transitions + transition(I_M, {TCC_AckM, TCC_AckCtoD}, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + w_writeCache; + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckS, S) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckE, E) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + /* Upgrade ack: no w_writeCache here, the existing cache data is kept. */ transition({S_M, O_M}, TCC_AckM, M) {TagArrayWrite, DataArrayWrite} { + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + /* Writeback refused: drop the TBE without sending data. */ transition({MO_I, ES_I}, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + /* Writeback accepted: send the TBE data, then free the TBE. */ transition({MO_I, ES_I}, TCC_AckWB, I) {TagArrayWrite, DataArrayRead} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + /* Writeback accepted after the victim was invalidated by a probe: send a stale notification instead of data. */ transition(I_C, TCC_AckWB, I) {TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + 
pr_popResponseQueue; + } + + // Probe transitions + /* Invalidating probe that wants data: reply with data and drop the line. */ transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + /* Nothing cached: reply with a miss. */ transition(I, PrbInvData) {TagArrayRead, TagArrayWrite} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // Needed for TCC-based protocols. Must hold on to ownership till transfer complete + transition({M, O}, LocalPrbInv, MO_PI){TagArrayRead, TagArrayWrite} { + piu_sendProbeResponseInvUntransferredOwnership; + pp_popProbeQueue; + } + + // If there is a race and we see a probe invalidate, handle normally. + transition(MO_PI, PrbInvData, I){TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_PI, PrbInv, I){TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // normal exit when ownership is successfully transferred + transition(MO_PI, TCC_AckCtoD, I) {TagArrayWrite} { + ic_invCache; + pr_popResponseQueue; + } + + /* Invalidating probe without data from any stable state: ack without data and drop the line. */ transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({E, S, I}, LocalPrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + + /* Downgrade probe: supply data and end in O. */ transition({M, E, O}, PrbShrData, O) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(MO_PI, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + + transition(S, PrbShrData, S) {TagArrayRead, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + 
+ transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + /* Probe while a fill is outstanding: ack without data, drop the entry, then allocate again immediately; the state is left unchanged. */ transition({I_M, I_ES}, {PrbInv, PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M, I_ES}, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + /* Upgrade from S loses the line to a probe: ack without data and fall back to I_M with a fresh entry. */ transition(S_M, PrbInvData, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + /* Upgrade from O loses the line to a probe: supply the data, then fall back to I_M with a fresh entry. */ transition(O_M, PrbInvData, I_M) {TagArrayRead,DataArrayRead} { + pdm_sendProbeResponseDataMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, {PrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(S_M, {LocalPrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(O_M, LocalPrbInv, I_M) {TagArrayRead} { + piu_sendProbeResponseInvUntransferredOwnership; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + /* Invalidating probe during an eviction: answer it and move to I_C to await the writeback ack or nack. */ transition(ES_I, PrbInvData, I_C){ + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {DataArrayRead} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + /* Shared probe during an eviction: supply data and set tbe.Shared; the state is unchanged. */ transition(ES_I, PrbShrData, ES_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO.slicc 
b/src/mem/protocol/GPU_RfO.slicc new file mode 100644 index 000000000..7773ce6e0 --- /dev/null +++ b/src/mem/protocol/GPU_RfO.slicc @@ -0,0 +1,11 @@ +protocol "GPU_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_RfO-TCP.sm"; +include "GPU_RfO-SQC.sm"; +include "GPU_RfO-TCC.sm"; +include "GPU_RfO-TCCdir.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-RegionBuffer.sm"; diff --git a/src/mem/protocol/GPU_VIPER-SQC.sm b/src/mem/protocol/GPU_VIPER-SQC.sm new file mode 100644 index 000000000..8d5b5699a --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-SQC.sm @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Blake Hechtman + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; // for 1MB L2, 20 for 2MB + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + + /* NOTE(review): no in_port in this machine reads probeToSQC - confirm it is intentionally unused. */ MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + /* Two stable states only; the transitions of this machine use no transient states. */ state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + } + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + Data, desc="Received Data"; + } + + /* Array-access kinds attached to transitions; recordRequestType and checkResourceAvailable map them onto the L1cache data and tag arrays. */ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; 
+ bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + /* The template argument names the generated TBE type for this machine (machine name followed by _TBE). */ TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + + /* Look up address in L1cache and return the result cast to an Entry pointer. */ Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + /* Return the TBE's DataBlk when a TBE exists for addr, otherwise the cache entry's DataBlk (the entry is not checked for validity here). */ DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + /* State precedence: TBE state, then cache entry state, then I. */ State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + /* Write state into whichever of the TBE and the cache entry are valid (both, when both are). */ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + /* Functional read: uses the TBE's DataBlk when a TBE exists, otherwise falls back to functionalMemoryRead. */ void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + 
TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + /* Permission precedence: the TBE's state, then the cache entry's state, then NotPresent. */ AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + /* Apply the permission that SQC_State_to_permission maps from state; does nothing when cache_entry is invalid. */ void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + /* Forward a data/tag array read/write RequestType to L1cache.recordRequestType as the same-named CacheRequestType; any other value is silently ignored. */ void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + /* Ask L1cache whether the DataArray (data requests) or TagArray (tag requests) resource is available for addr; any other RequestType calls error(). */ bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in 
checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + + // In Ports + + /* Response port: a TDSysResp triggers Data when the line is cached or L1cache.cacheAvail reports room, otherwise Repl on the victim returned by cacheProbe; any other response type is an error. */ in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if (responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + /* Core request port: only IFETCH is accepted (asserted); it always triggers Fetch. */ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } + } + } + + // Actions + + /* Deallocate the L1cache entry for address when one exists, then clear cache_entry in either case. */ action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + /* Enqueue a read request for address to the TCC chosen by mapAddressToRange. NOTE(review): the action is named and described as RdBlkS but the message Type is RdBlk - confirm which is intended. */ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + /* Allocate a new L1cache Entry for address only when no valid cache_entry is bound yet. */ action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + 
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + /* Complete the fetch: hand cache_entry.DataBlk to sequencer.readCallback, reporting MachineType:L1Cache. */ action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + /* Copy DataBlk from the peeked response into the entry and mark it clean. */ action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := false; + } + } + + // Transitions + + // transitions from base + /* Eviction only: no queue is popped here, so the response that triggered Repl stays queued. */ transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + /* Fill: allocate the entry, store the data, complete the fetch, and consume the response. */ transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { + a_allocate; + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + /* Fetch miss: issue the read request; the state stays I until Data arrives. */ transition(I, Fetch) {TagArrayRead, TagArrayWrite} { + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(V, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm new file mode 100644 index 000000000..f62df9f4f --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCC.sm @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Blake Hechtman + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + bool WB; /*is this cache Writeback?*/ + Cycles l2_request_latency := 50; + Cycles l2_response_latency := 20; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request"; + // To the Cores. TCC deals only with TCPs/SQCs. 
+ MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response"; + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="RdBlk event"; + WrVicBlk, desc="L1 Write Through"; + WrVicBlkBack, desc="L1 Write Through(dirty cache)"; + Atomic, desc="Atomic Op"; + AtomicDone, desc="AtomicOps Complete"; + AtomicNotDone, desc="AtomicOps not Complete"; + Data, desc="data message"; + // Coming from this TCC + L2_Repl, desc="L2 Replacement"; + // Probes + PrbInv, desc="Invalidating probe"; + // Coming from Memory Controller + WBAck, desc="writethrough ack from memory"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified(dirty cache only)"; + W, AccessPermission:Read_Write, desc="Written(dirty cache only)"; + V, AccessPermission:Read_Only, desc="Valid"; + I, AccessPermission:Invalid, desc="Invalid"; + IV, AccessPermission:Busy, desc="Waiting for Data"; + WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack"; + A, AccessPermission:Busy, desc="Invalid waiting on atomic Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", 
interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + WriteMask writeMask, desc="Dirty byte mask"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + NetDest Destination, desc="Data destination"; + int numAtomics, desc="number remaining atomics"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, 
Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, 
addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TriggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). 
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Unexpected Request Message from Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := 
CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := 
CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write back 
data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(wb_writeBack, "wb", desc="write back data") { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + + action(at_atomicThrough, "at", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.Dirty := true; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize 
:= MessageSizeType:Response_Control; + } + } + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(z_stall, "z", desc="stall") { + // built-in + } + + + action(ina_incrementNumAtomics, "ina", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics + 1; + } + + + action(dna_decrementNumAtomics, "dna", desc="dec num atomics") { + tbe.numAtomics := tbe.numAtomics - 1; + if (tbe.numAtomics==0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AtomicDone; + } + } + } + + action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") { + triggerQueue_in.dequeue(clockEdge()); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + // transitions from base + // Assumptions for ArrayRead/Write + // TBE checked before tags + // Data Read/Write requires Tag Read + + // Stalling transitions do NOT check the tag array...and if they do, + // they can cause a resource stall deadlock! 
+ + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, RdBlk, IV) {TagArrayRead} { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition(IV, RdBlk) { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition({V, I},Atomic, A) {TagArrayRead} { + i_invL2; + t_allocateTBE; + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition(A, Atomic) { + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition({M, W}, Atomic, WI) {TagArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, WrVicBlk) {TagArrayRead} { + wt_writeThrough; + p_popRequestQueue; + } + + transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} { + ut_updateTag; + wdb_writeDirtyBytes; + wt_writeThrough; + p_popRequestQueue; + } + + transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + i_invL2; + } + + transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + } + + transition({A, IV, WI}, L2_Repl) { + 
i_invL2; + } + + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(W, PrbInv) {TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({A, IV, WI}, PrbInv) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + pr_popResponseQueue; + dt_deallocateTBE; + } + + transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ar_sendAtomicResponse; + dna_decrementNumAtomics; + pr_popResponseQueue; + } + + transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} { + dt_deallocateTBE; + ptr_popTriggerQueue; + } + + transition(A, AtomicNotDone) {TagArrayRead} { + ptr_popTriggerQueue; + } + + //M,W should not see WBAck as the cache is in WB mode + //WBAcks do not need to check tags + transition({I, V, IV, A}, WBAck) { + w_sendResponseWBAck; + pr_popResponseQueue; + } + + transition(WI, WBAck,I) { + dt_deallocateTBE; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCP.sm b/src/mem/protocol/GPU_VIPER-TCP.sm new file mode 100644 index 000000000..d81196b17 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCP.sm @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Blake Hechtman + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : VIPERCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + bool WB; /*is this cache Writeback?*/ + bool disableL1; /* bypass L1 cache? 
*/ + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + MessageBuffer * mandatoryQueue; + +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + W, AccessPermission:Read_Write, desc="Written"; + M, AccessPermission:Read_Write, desc="Written and Valid"; + L, AccessPermission:Read_Write, desc="Local access is modifiable"; + A, AccessPermission:Invalid, desc="Waiting on Atomic"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store to L1 (L1 is dirty)"; + StoreThrough, desc="Store directly to L2(L1 is clean)"; + StoreLocal, desc="Store to L1 but L1 is clean"; + Atomic, desc="Atomic"; + Flush, desc="Flush if dirty(wbL1 for Store Release)"; + Evict, desc="Evict if clean(invL1 for Load Acquire)"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // TCC initiated + TCC_Ack, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + // Disable L1 cache + Bypass, desc="Bypass the entire L1 cache"; + } + + enumeration(RequestType, + desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + TagArrayFlash, desc="Flash clear the tag array"; + } + + + structure(Entry, 
desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + WriteMask writeMask, desc="written bytes masks"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int WTcnt, default="0"; + int Fcnt, default="0"; + bool inFlush, default="false"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; 
+ } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayFlash) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == 
RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayFlash) { + // FIXME should check once per cache, rather than once per cacheline + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + + // In Ports + + in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + // disable L1 cache + if (disableL1) { + trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || + in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected 
Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ATOMIC) { + trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ST) { + if(disableL1) { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (in_msg.segment == HSASegment:SPILL) { + trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe); + } else if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } // end if (disableL1) + } else if (in_msg.Type == RubyRequestType:FLUSH) { + trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ + trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else { + error("Unexpected Request Message from VIC"); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + action(ic_invCache, "ic", 
desc="invalidate cache") { + if(is_valid(cache_entry)) { + cache_entry.writeMask.clear(); + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(rb_bypassDone, "rb", desc="bypass L1 of read access") { + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp:= in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.readCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, tmp); + } + if(is_valid(cache_entry)) { + unset_cache_entry(); + } + } + } + + action(wab_bypassDone, "wab", desc="bypass L1 of write access") { + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp := in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.writeCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, tmp); + } + } + } + + action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") { + peek(mandatoryQueue_in, RubyRequest){ + if (cache_entry.writeMask.cmpMask(in_msg.writeMask)) { + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } else { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := 
MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + } + } + + action(wt_writeThrough, "wt", desc="Flush dirty data") { + WTcnt := WTcnt + 1; + APPEND_TRANSITION_COMMENT("write++ = "); + APPEND_TRANSITION_COMMENT(WTcnt); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + + action(at_atomicThrough, "at", desc="send Atomic") { + peek(mandatoryQueue_in, RubyRequest) { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(in_msg.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + cache_entry.writeMask.clear(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(sf_setFlush, "sf", desc="set flush") { + inFlush := true; + APPEND_TRANSITION_COMMENT(" inFlush is true"); + } + + 
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + + action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + action(inv_invDone, "inv", desc="local inv done") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); + assert(false); + } else { + coalescer.invCallback(address); + } + } + + action(wb_wbDone, "wb", desc="local wb done") { + if (inFlush == true) { + Fcnt := Fcnt + 1; + if (Fcnt > WTcnt) { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n"); + assert(false); + } else { + coalescer.wbCallback(address); + } + Fcnt := Fcnt - 1; + } + if (WTcnt == 0 && Fcnt == 0) { + inFlush := false; + APPEND_TRANSITION_COMMENT(" inFlush is false"); + } + } + } + + action(wd_wtDone, "wd", desc="writethrough done") { + WTcnt := WTcnt - 1; + if (inFlush == true) { + Fcnt := Fcnt -1; + } + assert(WTcnt >= 0); + APPEND_TRANSITION_COMMENT("write-- = "); + APPEND_TRANSITION_COMMENT(WTcnt); + } + + action(dw_dirtyWrite, "dw", desc="update write mask"){ + peek(mandatoryQueue_in, RubyRequest) { + cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + } + } + action(w_writeCache, "w", desc="write data to cache") { + 
peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask); + cache_entry.DataBlk := tmp; + } + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + +// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { +// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); +// } + + action(z_stall, "z", desc="stall; built-in") { + // built-in action + } + + // Transitions + // ArrayRead/Write assumptions: + // All requests read Tag Array + // TBE allocation writes the TagArray to I + // TBE only checked on misses + // Stores will also write dirty bits in the tag + // WriteThroughs still need to use cache entry as staging buffer for wavefront + + // Stalling transitions do NOT check the tag array...and if they do, + // they can cause a resource stall deadlock! + + transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} { + z_stall; + } + + transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + transition(I, Load) {TagArrayRead} { + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + mru_updateMRU; + at_atomicThrough; + p_popMandatoryQueue; + } + + transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} { + wt_writeThrough; + t_allocateTBE; + at_atomicThrough; + ic_invCache; + } + + transition(W, Load, I) {TagArrayRead, DataArrayRead} { + wt_writeThrough; + norl_issueRdBlkOrloadDone; + p_popMandatoryQueue; + } + + transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; 
+ } + + transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + //M,W should not see storeThrough + transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + a_allocate; + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition(I, Bypass, I) { + rb_bypassDone; + pr_popResponseQueue; + } + + transition(A, Bypass, I){ + d_deallocateTBE; + wab_bypassDone; + pr_popResponseQueue; + } + + transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} { + d_deallocateTBE; + a_allocate; + w_writeCache; + s_storeDone; + pr_popResponseQueue; + ic_invCache; + } + + transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({A}, Repl) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition(L, Repl, I) {TagArrayRead, 
TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + sf_setFlush; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V, I, A, L},Flush) {TagArrayFlash} { + sf_setFlush; + wb_wbDone; + p_popMandatoryQueue; + } + + transition({I, V}, Evict, I) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + ic_invCache; + } + + transition({W, M}, Evict, W) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + } + + transition({A, L}, Evict) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + } + + // TCC_AckWB only snoops TBE + transition({V, I, A, M, W, L}, TCC_AckWB) { + wd_wtDone; + wb_wbDone; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER.slicc b/src/mem/protocol/GPU_VIPER.slicc new file mode 100644 index 000000000..45f7f3477 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Baseline.slicc b/src/mem/protocol/GPU_VIPER_Baseline.slicc new file mode 100644 index 000000000..49bdce38c --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Baseline.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-probeFilter.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Region-TCC.sm b/src/mem/protocol/GPU_VIPER_Region-TCC.sm new file mode 100644 index 000000000..c3aef15a3 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Region-TCC.sm @@ -0,0 +1,773 
@@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor, Blake Hechtman + */ + +/* + * This file is inherited from GPU_VIPER-TCC.sm and retains its structure. 
+ * There are very few modifications in this file from the original VIPER TCC + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + bool WB; /*is this cache Writeback?*/ + int regionBufferNum; + Cycles l2_request_latency := 50; + Cycles l2_response_latency := 20; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request"; + // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC. + MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response"; + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock"; + + MessageBuffer * triggerQueue, ordered="true", random="false"; +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="RdBlk event"; + WrVicBlk, desc="L1 Write Through"; + WrVicBlkBack, desc="L1 Write Back(dirty cache)"; + Atomic, desc="Atomic Op"; + AtomicDone, desc="AtomicOps Complete"; + AtomicNotDone, desc="AtomicOps not Complete"; + Data, desc="data message"; + // Coming from this TCC + L2_Repl, desc="L2 Replacement"; + // Probes + PrbInv, desc="Invalidating probe"; + // Coming from Memory Controller + WBAck, desc="writethrough ack from memory"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified(dirty cache only)"; + W, AccessPermission:Read_Write, 
desc="Written(dirty cache only)"; + V, AccessPermission:Read_Only, desc="Valid"; + I, AccessPermission:Invalid, desc="Invalid"; + IV, AccessPermission:Busy, desc="Waiting for Data"; + WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack"; + A, AccessPermission:Busy, desc="Invalid waiting on atomic Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + WriteMask writeMask, desc="Dirty byte mask"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + NetDest Destination, desc="Data destination"; + int numAtomics, desc="number remaining atomics"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr 
addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr); + } else if (request_type == 
RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TriggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, 
block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + 
error("Unexpected Request Message from Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + // Data available at TCC. Send the DATA to TCP + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + // Data was not available at TCC. So, TCC forwarded the request to + // directory and directory responded back with data. Now, forward the + // DATA to TCP and send the unblock ack back to directory. 
+ action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + 
out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") { + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := false; + } + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + 
action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write through data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(wb_writeBack, "wb", desc="write back data") { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + + action(at_atomicThrough, "at", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.Dirty := true; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := 
CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + action(zz_recycleRequestQueue, "z", desc="stall"){ + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + action(ina_incrementNumAtomics, "ina", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics + 1; + } + + + action(dna_decrementNumAtomics, "dna", desc="dec num atomics") { + tbe.numAtomics := tbe.numAtomics - 1; + if (tbe.numAtomics==0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AtomicDone; + } + } + } + + action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") { + triggerQueue_in.dequeue(clockEdge()); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + // transitions from base + // Assumptions for ArrayRead/Write + // TBE checked before tags + // Data Read/Write requires Tag Read + + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + 
transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { // no pop here: the RdBlk stays queued while the block is written back
+    t_allocateTBE;
+    wb_writeBack;
+  }
+  // Read miss: allocate a TBE, request the data, and wait in IV.
+  transition(I, RdBlk, IV) {TagArrayRead} {
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+  // NOTE(review): t_allocateTBE runs again while already in IV; its body is not visible here -- confirm it tolerates an existing TBE.
+  transition(IV, RdBlk) {
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+  // Atomics are not performed locally: the L2 copy is invalidated and the atomic is forwarded; numAtomics tracks it.
+  transition({V, I},Atomic, A) {TagArrayRead} {
+    i_invL2;
+    t_allocateTBE;
+    at_atomicThrough;
+    ina_incrementNumAtomics;
+    p_popRequestQueue;
+  }
+  // Further atomics to a block already in A are forwarded and counted as well.
+  transition(A, Atomic) {
+    at_atomicThrough;
+    ina_incrementNumAtomics;
+    p_popRequestQueue;
+  }
+  // Block is written back first; the Atomic is not popped in this transition.
+  transition({M, W}, Atomic, WI) {TagArrayRead} {
+    t_allocateTBE;
+    wb_writeBack;
+  }
+
+  // Cache block stays in I state which implies
+  // this TCC is a write-no-allocate cache
+  transition(I, WrVicBlk) {TagArrayRead} {
+    wt_writeThrough;
+    p_popRequestQueue;
+  }
+  // Write hit in V: update the local copy and also write the data through.
+  transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
+    ut_updateTag;
+    wdb_writeDirtyBytes;
+    wt_writeThrough;
+    p_popRequestQueue;
+  }
+  // WrVicBlkBack is acked locally (swb_sendWBAck) and the data is kept in the cache; nothing is written through here.
+  transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+
+  transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+  // From I a block is allocated first, and the line ends up in W.
+  transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+  // Replacement of a W/M block: capture it in a TBE and write it back before invalidating the L2 copy.
+  transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+  }
+
+  transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
+    i_invL2;
+  }
+
+  transition({A, IV, WI}, L2_Repl) {
+    i_invL2;
+  }
+  // Every PrbInv is answered with pi_sendProbeResponseInv; only the resulting state differs per starting state.
+  transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition(W, PrbInv) {TagArrayRead} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition({A, IV, WI}, PrbInv) {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+  // Fill data arrives: install the block, answer the requester, notify the region buffer, release the TBE.
+  transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ut_updateTag;
+    wcb_writeCacheBlock;
+    sdr_sendDataResponse;
+    sd2rb_sendDone2RegionBuffer;
+    pr_popResponseQueue;
+    dt_deallocateTBE;
+  }
+  // One atomic response: reply to the core and decrement the outstanding-atomics count; the state stays A.
+  transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ar_sendAtomicResponse;
+    sd2rb_sendDone2RegionBuffer;
+    dna_decrementNumAtomics;
+    pr_popResponseQueue;
+  }
+  // AtomicDone trigger: release the TBE and return to I.
+  transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
+    dt_deallocateTBE;
+    ptr_popTriggerQueue;
+  }
+
+  transition(A, AtomicNotDone) {TagArrayRead} {
+    ptr_popTriggerQueue;
+  }
+
+  //M,W should not see WBAck as the cache is in WB mode
+  //WBAcks do not need to check tags
+  transition({I, V, IV, A}, WBAck) {
+    w_sendResponseWBAck;
+    sd2rb_sendDone2RegionBuffer;
+    pr_popResponseQueue;
+  }
+  // WBAck for this machine's own writeback (issued on the way into WI): release the TBE, go to I.
+  transition(WI, WBAck,I) {
+    sd2rb_sendDone2RegionBuffer;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+}
diff --git a/src/mem/protocol/GPU_VIPER_Region.slicc b/src/mem/protocol/GPU_VIPER_Region.slicc
new file mode 100644
index 000000000..cbfef9de3
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Region.slicc
@@ -0,0 +1,11 @@
+protocol "GPU_VIPER_Region";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-Region-CorePair.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-Region-dir.sm";
+include "GPU_VIPER_Region-TCC.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "MOESI_AMD_Base-RegionDir.sm";
+include "MOESI_AMD_Base-RegionBuffer.sm";
diff --git a/src/mem/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
new file mode 100644
index 000000000..76fe77230
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
@@ -0,0 +1,2904 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro
Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:CorePair, "CP-like Core Coherence")
+ : Sequencer * sequencer;
+   Sequencer * sequencer1;
+   CacheMemory * L1Icache;
+   CacheMemory * L1D0cache;
+   CacheMemory * L1D1cache;
+   CacheMemory * L2cache; // func mem logic looks in this CacheMemory
+   bool send_evictions := "False";
+   Cycles issue_latency := 5; // time to send data down to NB
+   Cycles l2_hit_latency := 18;
+
+  // BEGIN Core Buffers
+
+  // To the Network
+  MessageBuffer * requestFromCore, network="To", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseFromCore, network="To", virtual_network="2", vnet_type="response";
+  MessageBuffer * unblockFromCore, network="To", virtual_network="4", vnet_type="unblock";
+
+  // From the Network
+  MessageBuffer * probeToCore, network="From", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseToCore, network="From", virtual_network="2", vnet_type="response";
+
+  MessageBuffer * mandatoryQueue;
+
+  MessageBuffer * triggerQueue, ordered="true";
+
+  // END Core Buffers
+
+{
+  // BEGIN STATES
+  state_declaration(State, desc="Cache states", default="CorePair_State_I") {
+
+    // Base States
+    I, AccessPermission:Invalid, desc="Invalid";
+    S, AccessPermission:Read_Only, desc="Shared";
+    E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership";
+    E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership";
+    Es, AccessPermission:Read_Write, desc="Exclusive in core";
+    O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+    Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line";
+    M0, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+    M1, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+
+    // Transient States
+    I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+    I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+    I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well";
+    I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well";
+    I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well";
+    I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well";
+    I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+    I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+    I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters";
+
+    IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive";
+    IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive";
+    IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills";
+    IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+    IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+    F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received";
+    F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received";
+
+    ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+    MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+    MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+    MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+    S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+    S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+    S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+    O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+    O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+    O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+    Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+    Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+    S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    O_M0, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+    O_M1, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+    S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+    S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+    Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+    Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+    Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+    E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+    E1_F, AccessPermission:Read_Write, desc="...";
+    E0_Es, AccessPermission:Read_Write, desc="...";
+    E1_Es, AccessPermission:Read_Write, desc="...";
+    Ms_F0, AccessPermission:Read_Write, desc="...";
+    Ms_F1, AccessPermission:Read_Write, desc="...";
+    Ms_F, AccessPermission:Read_Write, desc="...";
+    M0_F, AccessPermission:Read_Write, desc="...";
+    M0_Ms, AccessPermission:Read_Write, desc="...";
+    M1_F, AccessPermission:Read_Write, desc="...";
+    M1_Ms, AccessPermission:Read_Write, desc="...";
+
+    I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+    S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+    S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+    S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+  } // END STATES
+
+  // BEGIN EVENTS
+  enumeration(Event, desc="CP Events") {
+    // CP Initiated events
+    C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+    C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+    C1_Load_L1miss, desc="Cluster 1 load L1 missed";
+    C1_Load_L1hit, desc="Cluster 1 load L1 hit";
+    Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+    Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+    Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+    Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+    C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+    C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+    C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+    C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+    // NB Initiated events
+    NB_AckS, desc="NB Ack to Core Request";
+    NB_AckM, desc="NB Ack to Core Request";
+    NB_AckE, desc="NB Ack to Core Request";
+
+    NB_AckWB, desc="NB Ack for writeback";
+
+    // Memory System initiated events
+    L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+    L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+    L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+    L2_Repl, desc="Replace address from L2";
+
+    L2_to_L1D0, desc="L1 fill from L2";
+    L2_to_L1D1, desc="L1 fill from L2";
+    L2_to_L1I, desc="L1 fill from L2";
+
+    // Probe Events
+    PrbInvData, desc="probe, return O or M data";
+    PrbInv, desc="probe, no need for data";
+    PrbShrData, desc="probe downgrade, return O or M data";
+
+  } // END EVENTS
+  // One member per (cache, array, access kind): caches L1D0/L1D1/L1I/L2, arrays Data/Tag, kinds Read/Write.
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    L1D0DataArrayRead, desc="Read the data array";
+    L1D0DataArrayWrite, desc="Write the data array";
+    L1D0TagArrayRead, desc="Read the tag array";
+    L1D0TagArrayWrite, desc="Write the tag array";
+    L1D1DataArrayRead, desc="Read the data array";
+    L1D1DataArrayWrite, desc="Write the data array";
+    L1D1TagArrayRead, desc="Read the tag array";
+    L1D1TagArrayWrite, desc="Write the tag array";
+    L1IDataArrayRead, desc="Read the data array";
+    L1IDataArrayWrite, desc="Write the data array";
+    L1ITagArrayRead, desc="Read the tag array";
+    L1ITagArrayWrite, desc="Write the tag array";
+    L2DataArrayRead, desc="Read the data array";
+    L2DataArrayWrite, desc="Write the data array";
+    L2TagArrayRead, desc="Read the tag array";
+    L2TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // BEGIN STRUCTURE DEFINITIONS
+
+
+  // Cache Entry
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff than memory)?";
+    DataBlock DataBlk, desc="data for the block";
+    bool FromL2, default="false", desc="block just moved from L2";
+  }
+  // TBE: per-address record holding the transient state and a copy of the block's data.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+    bool Dirty, desc="Is the data dirty (different than memory)?";
+    int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+    bool Shared, desc="Victim hit by shared probe";
+  }
+  // Externally implemented table of TBEs, keyed by address.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs";
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // END STRUCTURE DEFINITIONS
+
+  // BEGIN INTERNAL FUNCTIONS
+
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+  // True when any of the four caches (L2, L1I, L1D0, L1D1) has a tag for addr.
+  bool addressInCore(Addr addr) {
+    return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr));
+  }
+  // Return the L2cache entry for address (the result of L2cache.lookup cast to Entry).
+  Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
+    return L2cache_entry;
+  }
+  // Prefer the TBE's data when a TBE exists for addr, otherwise use the L2 entry's data.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    } else {
+      return getCacheEntry(addr).DataBlk;
+    }
+  }
+  // cluster == 0 selects L1D0cache; any other value selects L1D1cache.
+  Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" {
+    if (cluster == 0) {
+      Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr));
+      return L1D0_entry;
+    } else {
+      Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr));
+      return L1D1_entry;
+    }
+  }
+  // Return the L1Icache entry for addr.
+  Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+    Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr));
+    return c_entry;
+  }
+  // presentOrAvail*: true when the named cache already holds the tag or cacheAvail(addr) holds for it.
+  bool presentOrAvail2(Addr addr) {
+    return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailI(Addr addr) {
+    return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailD0(Addr addr) {
+    return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailD1(Addr addr) {
+    return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr);
+  }
+  // State lookup order: the TBE first, then the cache entry, otherwise I.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+  // Stores the new state in whichever of the TBE and the cache entry are valid (both, if both are).
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+  // Permission comes from the TBE state if a TBE exists, else from the L2 entry's state, else NotPresent.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return CorePair_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return CorePair_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+
void functionalRead(Addr addr, Packet *pkt) { // reads from the TBE's data when one exists, else falls back to functionalMemoryRead
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+  // Returns the summed results of testAndWrite (TBE copy, if any) and functionalMemoryWrite.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+            testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+  // No-op when cache_entry is invalid.
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(CorePair_State_to_permission(state));
+    }
+  }
+  // Reports L2Cache once for an entry flagged FromL2 (and clears the flag); reports L1Cache otherwise.
+  MachineType testAndClearLocalHit(Entry cache_entry) {
+    assert(is_valid(cache_entry));
+    if (cache_entry.FromL2) {
+      cache_entry.FromL2 := false;
+      return MachineType:L2Cache;
+    } else {
+      return MachineType:L1Cache;
+    }
+  }
+  // Routes each RequestType to recordRequestType on the cache it names, using the matching CacheRequestType.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L1D0DataArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L2DataArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+  // Read and write of the same array check the same resource (DataArray or TagArray) on the named cache.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L2DataArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+
+    } else {
+      return true;
+    }
+  }
+
+  // END INTERNAL FUNCTIONS
+
+  // ** OUT_PORTS **
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromCore);
+  out_port(responseNetwork_out, ResponseMsg, responseFromCore);
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+  out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+  // ** IN_PORTS **
+  // Trigger queue: an L2_to_L1 trigger becomes L2_to_L1I / L2_to_L1D0 / L2_to_L1D1 according to in_msg.Dest.
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == TriggerType:L2_to_L1) {
+          if (in_msg.Dest == CacheId:L1I) {
+            trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D0) {
+            trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D1) {
+            trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("unexpected trigger dest");
+          }
+        }
+      }
+    }
+  }
+
+  // Probes: PrbInv maps to PrbInvData or PrbInv depending on ReturnData; PrbDowngrade maps to PrbShrData.
+  in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          assert(in_msg.ReturnData);
+          trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+        }
+      }
+    }
+  }
+
+
+  // ResponseNetwork
+  in_port(responseToCore_in, ResponseMsg, responseToCore) {
+    if (responseToCore_in.isReady(clockEdge())) {
+      peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+          if (in_msg.State == CoherenceState:Modified) {
+            trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.State == CoherenceState:Shared) {
+            trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.State == CoherenceState:Exclusive) {
+            trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+          trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Nothing from the Unblock Network
+  // Core requests: contextId parity picks the cluster (even -> Ifetch0/C0_* events, odd -> Ifetch1/C1_*); when a needed cache has no room, a *_Repl event is triggered on the victim instead.
+  // Mandatory Queue
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+        if (in_msg.Type == RubyRequestType:IFETCH) {
+          // FETCH ACCESS
+
+          if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+            if (mod(in_msg.contextId, 2) == 0) {
+              trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            } else {
+              trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            }
+          } else {
+            if (presentOrAvail2(in_msg.LineAddress)) {
+              if (presentOrAvailI(in_msg.LineAddress)) {
+                if (mod(in_msg.contextId, 2) == 0) {
+                  trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                }
+              } else {
+                Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L1I_Repl, victim,
+                        getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            } else { // Not present or avail in L2
+              Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+              trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                      TBEs.lookup(victim));
+            }
+          }
+        } else {
+          // DATA ACCESS
+          if (mod(in_msg.contextId, 2) == 1) {
+            if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                // Stores must write through, make sure L2 avail.
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD1(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+                  trigger(Event:L1D1_Repl, victim,
+                          getCacheEntry(victim), TBEs.lookup(victim));
+                }
+              } else { // not present or avail in L2
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            }
+          } else {
+            Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+            if (is_valid(L1D0cache_entry)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD0(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+                  trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              } else {
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                        TBEs.lookup(victim));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  // ACTIONS
+  action(ii_invIcache, "ii", desc="invalidate iCache") {
+    if (L1Icache.isTagPresent(address)) {
+      L1Icache.deallocate(address);
+    }
+  }
+  // The invalidate actions deallocate only when the tag is actually present in that cache.
+  action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+  }
+
+  action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+
+  action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+  // unset_cache_entry() is called whether or not an L2 entry was deallocated.
+  action(i2_invL2, "i2", desc="invalidate L2") {
+    if(is_valid(cache_entry)) {
+      L2cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  action(mru_setMRU, "mru", desc="Update LRU state") {
+    L2cache.setMRU(address);
+  }
+
+  action(mruD1_setD1cacheMRU, "mruD1", desc="Update LRU state") {
+    L1D1cache.setMRU(address);
+  }
+
+  action(mruD0_setD0cacheMRU, "mruD0", desc="Update LRU state") {
+    L1D0cache.setMRU(address);
+  }
+
+  action(mruI_setIcacheMRU, "mruI", desc="Update LRU state") {
+    L1Icache.setMRU(address);
+  }
+  // n / nM / nS: control-sized RdBlk / RdBlkM / RdBlkS requests sent to the directory after issue_latency.
+  action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlk;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      DPRINTF(RubySlicc,"%s\n",out_msg.Destination);
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+  // VicDirty request carrying the entry's data; Shared is set only when the entry is in state O.
+  action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+  // VicClean request without data; Shared is set only when the entry is in state S.
+  action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) { // NOTE(review): unlike vd_victim, cache_entry is not asserted valid before this dereference
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+  // The L1 allocate actions only allocate when the tag is not already present.
+  action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+    if (L1D0cache.isTagPresent(address) == false) {
+      L1D0cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+    if (L1D1cache.isTagPresent(address) == false) {
+      L1D1cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+    if (L1Icache.isTagPresent(address) == false) {
+      L1Icache.allocateVoid(address, new Entry);
+    }
+  }
+  // The L2 allocation also binds the new entry as the transition's cache_entry.
+  action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L2cache.allocate(address, new Entry));
+    }
+  }
+  // Requires a valid cache entry: the new TBE is seeded with the entry's data and dirty bit, and Shared starts false.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Shared := false;
+  }
+
+  action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+  // Queue-pop actions: each dequeues the head of one input port at the current clock edge.
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToCore_in.dequeue(clockEdge());
+  }
+
+  action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+  // Completes an instruction fetch on `sequencer`; only the L1I entry is asserted valid, the L2 entry's data is passed unchecked.
+  action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer.readCallback(address,
+                           l2entry.DataBlk,
+                           true,
+                           testAndClearLocalHit(entry));
+  }
+
+  action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer1.readCallback(address,
+                            l2entry.DataBlk,
+                            true,
+
testAndClearLocalHit(entry)); + } + + action(l0_loadDone, "l0", desc="Cluster 0 load done") { + Entry entry := getL1CacheEntry(address, 0); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l1_loadDone, "l1", desc="Cluster 1 load done") { + Entry entry := getL1CacheEntry(address, 1); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(xl0_loadDone, "xl0", desc="Cluster 0 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", address, l2entry.DataBlk); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xl1_loadDone, "xl1", desc="Cluster 1 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + 
sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(s0_storeDone, "s0", desc="Cluster 0 store done") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + entry.Dirty := true; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + 
assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, 
"fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D0; + } + } + + action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D1; + } + } + + action(wi_writeIcache, "wi", desc="write data to icache (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := 
in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + DPRINTF(ProtocolTrace, "CP writeD0: address %s, data: %s\n", address, in_msg.DataBlk); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", 
out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + assert(addressInCore(address) || is_valid(tbe)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + if (addressInCore(address)) { + out_msg.Hit := true; + } else { + out_msg.Hit := false; + } + out_msg.Dirty := false; // not sending back data, so def. not dirty + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(l2m_profileMiss, "l2m", desc="l2m miss profile") { + ++L2cache.demand_misses; + } + + 
action(l10m_profileMiss, "l10m", desc="l10m miss profile") { + ++L1D0cache.demand_misses; + } + + action(l11m_profileMiss, "l11m", desc="l11m miss profile") { + ++L1D1cache.demand_misses; + } + + // shorthand spelled "l1im" (letter i) to match the action name and desc + action(l1im_profileMiss, "l1im", desc="l1im miss profile") { + ++L1Icache.demand_misses; + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xx_recycleResponseQueue, "xx", desc="recycle response queue") { + responseToCore_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead,L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, C0_Store_L1miss, I_M0) 
{L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + // cluster 1 store miss: the tag lookup is on L1D1 (the body allocates and profiles L1D1) + transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? 
+ a0_allocateL1D; + l10m_profileMiss; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // THES SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW + transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + 
ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + // s1_storeDone writes the L1D1 data block, so request the L1D1 data array (was a duplicated L1D1TagArrayWrite) + transition(E0, C1_Store_L1miss, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + // cluster 0 load miss: count it against L1D0, the cache being allocated and filled below + transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + 
mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite} { + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue CtoD + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, 
L2TagArrayRead, L2DataArrayRead } { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + // cluster 1 load miss: the tag lookup is on L1D1 (the body allocates and profiles L1D1) + transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead,L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + // s1_storeDone writes the L1D1 data block, so request the L1D1 (not L1D0) data array + transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + // count the L1D1 miss, as the mirrored M0_F transition does for L1D0 + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + 
p_popMandatoryQueue; + } + + transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + // end transitions from base + + // Begin simple hit transitions + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, + Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} { + // track hits, if implemented + l0_loadDone; + mruD0_setD0cacheMRU; + p_popMandatoryQueue; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, + Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} { + // track hits, if implemented + l1_loadDone; + mruD1_setD1cacheMRU; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il0_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + // an I-fetch hit only reads the I-cache, so request the data array for read (same as Ifetch0_L1hit) + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il1_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + // end simple hit transitions + + // Transitions from transient states + + // recycles + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1, + O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, 
I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0, + O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, + IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F, + O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F, + E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C, + S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1, + Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms, + M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + // state set is comma-separated like every other set here (MO_S1 and S_F0 are two distinct states) + transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0, + Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F, + M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1, + O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, + M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, 
I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS, + PrbInvData, PrbInv, PrbShrData}) {} { + yy_recycleProbeQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} { + xx_recycleResponseQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. 
+ } + + transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Load_L1miss, I_M0Ms) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Load_L1miss, I_M1Ms) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Store_L1miss, I_M0M1) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Store_L1miss, I_M1M0) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_E0S, C1_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E1S, C0_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F0, C1_Load_L1miss, S_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) { L2DataArrayRead} { + l11m_profileMiss; + 
a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} { + i0_invCluster; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} { + i1_invCluster; + } + + transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} { + ii_invIcache; + } + + transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vc_victim; + ib_invBothClusters; + i2_invL2; + ii_invIcache; + } + + transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vd_victim; + i2_invL2; + ib_invBothClusters; // nothing will happen for D0 on M1, vice versa + } + + transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckS, S_C) {L1D0DataArrayWrite,L2DataArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckS, S_C) {L1D1DataArrayWrite, L2DataArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + 
xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + // THESE MO->M1 should not be instantaneous but oh well for now. + transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + i0_invCluster; + s1_storeDone; + pr_popResponseQueue; + } + + transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + i1_invCluster; + s0_storeDone; + pr_popResponseQueue; + } + + // Above should be more like this, which has some latency to xfer to L1 + transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + f1_L2ToL1; + pr_popResponseQueue; + } + + transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + f0_L2ToL1; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + 
pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) 
{L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + // L2_to_L1D1 fill: tag the L1D1 (not L1I) data array write, matching the other L2_to_L1D1 transitions + transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + 
mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) {} { + pt_popTriggerQueue; + } + + transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + 
transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + // Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, PrbInvData, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; 
+ ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, PrbShrData, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, PrbShrData, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbShrData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, PrbShrData) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + 
ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbShrData, ES_I) {} { + ph_sendProbeResponseHit; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInvData, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInvData, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({MO_S0, MO_S1}, PrbShrData) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInv}, IF_E0S) {}{ + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + 
i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, PrbShrData, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, PrbShrData, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, PrbShrData, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, PrbInvData, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M0, PrbInvData, I_M0) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + 
+ transition({S_M0, O_M0}, {PrbInv}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(S_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M1, O_M1}, {PrbInv}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S0, S0_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S1, S1_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, S_M1}, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({O_M0, O_M1}, PrbShrData) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({S0, S1, S0_C, S1_C}, PrbShrData) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInvData, IF_E0S) { L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; 
+ a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInvData, IF_E1S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInvData, IF_ES) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInv, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms}, PrbShrData, O_F0) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + // Mirror of the {Ms_F0, M0_F, M1_Ms} case: answer the probe with data and pop it here, + // instead of leaving it queued with an empty action list + transition({Ms_F1, M1_F, M0_Ms}, PrbShrData, O_F1) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Ms_F}, PrbShrData, O_F) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({O_F0, O_F1, O_F}, PrbShrData) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + // END TRANSITIONS +} + + diff --git a/src/mem/protocol/MOESI_AMD_Base-L3cache.sm b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm new file mode 100644 index 000000000..479cf4e78 --- /dev/null +++ 
b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:L3Cache, "L3") + : CacheMemory * L3cache; + WireBuffer * reqToDir; + WireBuffer * respToDir; + WireBuffer * l3UnblockToDir; + WireBuffer * reqToL3; + WireBuffer * probeToL3; + WireBuffer * respToL3; + Cycles l3_request_latency := 1; + Cycles l3_response_latency := 35; + + // To the general response network + MessageBuffer * responseFromL3, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToL3, network="From", virtual_network="2", ordered="false", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="L3 Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L2 Victim (dirty)"; + WrVicBlkShared, desc="L2 Victim (dirty)"; + ClVicBlk, desc="L2 Victim (clean)"; + ClVicBlkShared, desc="L2 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="WB stale; no data"; + + L3_Repl, desc="L3 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + // Base States: + state_declaration(State, desc="L3 State", default="L3Cache_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory + I, AccessPermission:Invalid, desc="Invalid"; + + I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M"; + S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E"; + S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S"; + E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data"; + O_M, AccessPermission:Busy, desc="..."; + O_O, AccessPermission:Busy, desc="..."; + O_E, AccessPermission:Busy, desc="..."; + O_S, AccessPermission:Busy, desc="..."; + M_M, AccessPermission:Busy, desc="..."; + M_O, AccessPermission:Busy, desc="..."; + M_E, AccessPermission:Busy, desc="..."; + M_S, AccessPermission:Busy, desc="..."; + D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive"; + MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem"; + MO_I, AccessPermission:Busy, desc="M or O, received L3_Repl, waiting for WBAck from Mem"; + I_I, AccessPermission:Busy, desc="I_MO received L3_Repl"; + I_CD, AccessPermission:Busy, desc="I_I received WBAck, now just waiting for 
CPUData"; + I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused"; + } + + // Array-access kinds; the tag-array entries are distinct from the data-array entries + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L3cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L3cache.isTagPresent(addr) || L3cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if 
(is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Functional read: served from the TBE's data block when a TBE exists
+  // for addr, otherwise passed to functionalMemoryRead().
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if (is_invalid(tbe)) {
+      functionalMemoryRead(pkt);
+    } else {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    }
+  }
+
+  // Functional write: updates the TBE copy (if any) and always calls
+  // functionalMemoryWrite(); returns the sum of both write counts.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if (is_valid(tbe)) {
+      num_writes := num_writes + testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_writes := num_writes + functionalMemoryWrite(pkt);
+    return num_writes;
+  }
+
+  // Permission for addr: the TBE state is consulted first, then the
+  // cache entry; NotPresent when neither exists.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if (is_valid(tbe)) {
+      return L3Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if (is_valid(cache_entry)) {
+      return L3Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  // Push the permission implied by state onto the entry, if there is one.
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L3Cache_State_to_permission(state));
+    }
+  }
+
+  // Intentionally empty: nothing is recorded for any request type.
+  void recordRequestType(RequestType request_type, Addr addr) {
+
+  }
+
+  // Always reports the resource as available.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    return true;
+  }
+
+
+  // OUT PORTS
+  out_port(requestNetwork_out, CPURequestMsg, reqToDir);
+  out_port(L3Resp_out, ResponseMsg, respToDir);
+  out_port(responseNetwork_out, ResponseMsg, responseFromL3);
+  out_port(unblockNetwork_out, UnblockMsg, l3UnblockToDir);
+
+  // IN PORTS
+  in_port(NBResponse_in, ResponseMsg, respToL3) {
+    if (NBResponse_in.isReady(clockEdge())) {
+      peek(NBResponse_in, ResponseMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+          trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+        } else {
+          DPRINTF(RubySlicc, "%s\n",
in_msg);
+          error("Error on NBResponse Type");
+        }
+      }
+    }
+  }
+
+  // Response Network
+  // CPUData becomes CPUDataShared or CPUData depending on NbReqShared;
+  // StaleNotif becomes StaleWB; anything else is a protocol error.
+  // NOTE(review): no transition in this machine's table consumes StaleWB --
+  // confirm StaleNotif can never reach the L3.
+  in_port(responseNetwork_in, ResponseMsg, responseToL3) {
+    if (responseNetwork_in.isReady(clockEdge())) {
+      peek(responseNetwork_in, ResponseMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceResponseType:CPUData) {
+          if (in_msg.NbReqShared) {
+            trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:CPUData, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+          trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+          // This port is not the NB response port; name it correctly.
+          error("Error on Response Type");
+        }
+      }
+    }
+  }
+
+  // probe network
+  // PrbInv maps to PrbInvData / PrbInv depending on ReturnData.
+  // PrbDowngrade is only expected with ReturnData set (PrbShrData).
+  in_port(probeNetwork_in, NBProbeRequestMsg, probeToL3) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, NBProbeRequestMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("Don't think I should get any of these");
+          }
+        } else {
+          // Without this branch an unexpected probe type triggers nothing
+          // and stays at the head of the queue; fail loudly like the
+          // response ports do.
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+          error("Error on Probe Type");
+        }
+      }
+    }
+  }
+
+  // Request Network
+  in_port(requestNetwork_in, CPURequestMsg, reqToL3) {
+    if (requestNetwork_in.isReady(clockEdge())) {
+      peek(requestNetwork_in, CPURequestMsg) {
+        assert(in_msg.Destination.isElement(machineID));
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceRequestType:RdBlk) {
+          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+          trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+        }
else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + if (is_valid(tbe) && tbe.From == in_msg.Requestor) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL3, "i", desc="invalidate L3 cache block") { + if (is_valid(cache_entry)) { + L3cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + 
peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToMem, "r", desc="Miss in L3, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L3 data") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + 
out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate L3 block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L3cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to L3") 
{ + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to L3: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to L3") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. 
set MRU)") { + L3cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNBResponseQueue, "pn", desc="pop NB response queue") { + NBResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; + p_popRequestQueue; + } + + transition(O, RdBlk ) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(M, RdBlk, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(S, RdBlk) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(E, RdBlk, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({M, O}, RdBlkS, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({E, S}, RdBlkS, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(M, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + p_popRequestQueue; + } + + transition({O, S}, {RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; // can't handle this, just forward + p_popRequestQueue; + } + + transition(E, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + 
p_popRequestQueue; + } + + transition({I}, WrVicBlk, I_M) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {} { + zz_recycleRequestQueue; + } + + transition({I}, WrVicBlkShared, I_O) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlkShared, S_O) {TagArrayRead, TagArrayWrite} { +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlk, S_M) {TagArrayRead, TagArrayWrite} { // should be technically not possible, but assume the data comes back with shared bit flipped +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlk, E_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlkShared, E_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlk, O_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlkShared, O_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlk, M_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlkShared, M_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlk, I_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; 
+ } + + transition({I}, ClVicBlkShared, I_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlk, S_E) {TagArrayRead, TagArrayWrite} { // technically impossible, assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlkShared, S_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlk, E_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlkShared, E_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlk, O_E) {TagArrayRead, TagArrayWrite} { // technically impossible, but assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlkShared, O_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlk, M_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlkShared, M_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {} { + r_requestToMem; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {TagArrayWrite} { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + 
dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_S, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
+ pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite, DataArrayRead} { + uu_sendUnblock; + wt_writeDataToTBE; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_CD, {CPUData, CPUDataShared}, I) {DataArrayRead, TagArrayWrite} { + uu_sendUnblock; + wt_writeDataToTBE; + wb_data; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M, O}, L3_Repl, MO_I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + vd_vicDirty; + i_invL3; + } + + transition({E, S,}, L3_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL3; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, O_S, M_M, M_O, M_E, M_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + i_invL3; + pp_popProbeQueue; + } + + transition({E, S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S) {TagArrayRead, TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(I, PrbShrData) {TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) 
{TagArrayWrite, DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData) {DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) {} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) {} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_I, {WBAck}, I_CD) {TagArrayWrite} { + pn_popNBResponseQueue; + } + + transition(MOD_I, WBAck, D_I) {DataArrayRead} { + wb_data; + pn_popNBResponseQueue; + } + + transition(MO_I, WBAck, I) {DataArrayRead, TagArrayWrite} { + wb_data; + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition(I_C, {WBAck}, I) {TagArrayWrite} { + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + i_invL3; + p_popRequestQueue; + } + + transition({S_S, S_O, S_M, S_E}, CancelWB, S) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({E_M, E_O, E_E, E_S}, CancelWB, E) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({O_M, O_O, O_E, O_S}, CancelWB, O) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({M_M, M_O, M_E, M_S}, CancelWB, M) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(D_I, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(MOD_I, CancelWB, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + p_popRequestQueue; + } + + transition(I_I, CancelWB, I_C) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + mc_cancelMemWriteback; + p_popRequestQueue; + } + + transition(I_CD, 
CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + mc_cancelMemWriteback; + p_popRequestQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm new file mode 100644 index 000000000..fd84447a2 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm @@ -0,0 +1,3009 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:CorePair, "CP-like Core Coherence") + : Sequencer * sequencer; + Sequencer * sequencer1; + CacheMemory * L1Icache; + CacheMemory * L1D0cache; + CacheMemory * L1D1cache; + CacheMemory * L2cache; + int regionBufferNum; + bool send_evictions := "False"; + Cycles issue_latency := 5; + Cycles l2_hit_latency := 18; + + // BEGIN Core Buffers + + // To the Network + MessageBuffer * requestFromCore, network="To", virtual_network="0", ordered="true", vnet_type="request"; + MessageBuffer * responseFromCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="4", ordered="false", vnet_type="unblock"; + + // From the Network + MessageBuffer * probeToCore, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="From", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * mandatoryQueue, ordered="false"; + MessageBuffer * triggerQueue, ordered="true"; + + // END Core Buffers + +{ + // BEGIN STATES + state_declaration(State, desc="Cache states", default="CorePair_State_I") { + + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership"; + E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership"; 
+ Es, AccessPermission:Read_Write, desc="Exclusive in core"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line"; + M0, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + M1, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + + // Transient States + I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well"; + I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well"; + I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well"; + I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well"; + I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters"; + + IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive"; + IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive"; + IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills"; + IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received"; + F_S1, AccessPermission:Busy, desc="same, 
but going to S1 when trigger received";
+
+    // Replacement, writeback, fill and upgrade transients.
+    ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+    MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+    MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+    MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+    S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+    S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+    S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+    O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+    O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+    O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+    Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+    Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+    S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    // The O_* prefix denotes Owned elsewhere in this list (see O_F0);
+    // the descriptions were copied from S_M0/S_M1.
+    O_M0, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+    O_M1, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+    S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+    S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+    // Fill transients that keep Read_Write permission.
+    Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+    Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+    Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+    E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+    E1_F, AccessPermission:Read_Write, desc="...";
+    E0_Es, AccessPermission:Read_Write, desc="...";
+    E1_Es, AccessPermission:Read_Write, desc="...";
+    Ms_F0, AccessPermission:Read_Write, desc="...";
+    Ms_F1,
AccessPermission:Read_Write, desc="...";
+    Ms_F, AccessPermission:Read_Write, desc="...";
+    M0_F, AccessPermission:Read_Write, desc="...";
+    M0_Ms, AccessPermission:Read_Write, desc="...";
+    M1_F, AccessPermission:Read_Write, desc="...";
+    M1_Ms, AccessPermission:Read_Write, desc="...";
+
+    // Cancelled-writeback transients.
+    I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+    S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+    S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+    S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+  } // END STATES
+
+  // BEGIN EVENTS
+  enumeration(Event, desc="CP Events") {
+    // CP Initiated events
+    C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+    C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+    C1_Load_L1miss, desc="Cluster 1 load L1 missed";
+    C1_Load_L1hit, desc="Cluster 1 load L1 hit";
+    Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+    Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+    Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+    Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+    C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+    C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+    C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+    C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+    // NB Initiated events
+    NB_AckS, desc="NB Ack to Core Request";
+    NB_AckM, desc="NB Ack to Core Request";
+    NB_AckE, desc="NB Ack to Core Request";
+
+    NB_AckWB, desc="NB Ack for writeback";
+
+    // Memory System initiated events
+    L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+    L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+    L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+    L2_Repl, desc="Replace address from L2";
+
+    L2_to_L1D0, desc="L1 fill from L2";
+    L2_to_L1D1, desc="L1 fill from L2";
+    L2_to_L1I, desc="L1 fill from L2";
+
+    // Probe Events
+    PrbInvData, desc="probe, return O or M data";
+    PrbInvDataDemand, desc="probe, return O or M data. Demand request";
+    PrbInv, desc="probe, no need for data";
+    PrbShrData, desc="probe downgrade, return O or M data";
+    PrbShrDataDemand, desc="probe downgrade, return O or M data. Demand request";
+    ForceRepl, desc="probe from r-buf. Act as though a repl";
+    // NOTE(review): same description as ForceRepl -- confirm whether a
+    // downgrade-specific description was intended.
+    ForceDowngrade, desc="probe from r-buf. Act as though a repl";
+
+  } // END EVENTS
+
+  // Resource tags that transitions name in their {...} resource list:
+  // a data-array and a tag-array read/write pair for each of the four
+  // caches (L1D0, L1D1, L1I, L2).
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    L1D0DataArrayRead, desc="Read the data array";
+    L1D0DataArrayWrite, desc="Write the data array";
+    L1D0TagArrayRead, desc="Read the tag array";
+    L1D0TagArrayWrite, desc="Write the tag array";
+    L1D1DataArrayRead, desc="Read the data array";
+    L1D1DataArrayWrite, desc="Write the data array";
+    L1D1TagArrayRead, desc="Read the tag array";
+    L1D1TagArrayWrite, desc="Write the tag array";
+    L1IDataArrayRead, desc="Read the data array";
+    L1IDataArrayWrite, desc="Write the data array";
+    L1ITagArrayRead, desc="Read the tag array";
+    L1ITagArrayWrite, desc="Write the tag array";
+    L2DataArrayRead, desc="Read the data array";
+    L2DataArrayWrite, desc="Write the data array";
+    L2TagArrayRead, desc="Read the tag array";
+    L2TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // BEGIN STRUCTURE DEFINITIONS
+
+
+  // Cache Entry
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff than memory)?";
+    DataBlock DataBlk, desc="data for the block";
+    bool FromL2, default="false", desc="block just moved from L2";
+  }
+
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+    bool Dirty, desc="Is the data dirty (different than memory)?";
+    int NumPendingMsgs, desc="Number of acks/data
messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + bool AckNeeded, desc="True if need to ack r-dir"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // END STRUCTURE DEFINITIONS + + // BEGIN INTERNAL FUNCTIONS + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + bool addressInCore(Addr addr) { + return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr)); + } + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address)); + return L2cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { + if (cluster == 0) { + Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr)); + return L1D0_entry; + } else { + Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr)); + return L1D1_entry; + } + } + + Entry getICacheEntry(Addr addr), return_by_pointer="yes" { + Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr)); + return c_entry; + } + + bool presentOrAvail2(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + bool presentOrAvailI(Addr addr) { + return L1Icache.isTagPresent(addr) || 
L1Icache.cacheAvail(addr); + } + + bool presentOrAvailD0(Addr addr) { + return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr); + } + + bool presentOrAvailD1(Addr addr) { + return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return CorePair_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return CorePair_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(CorePair_State_to_permission(state)); + } + } + + 
MachineType testAndClearLocalHit(Entry cache_entry) {
+    // Reports where a local hit was served from: L2Cache if the entry was
+    // just filled from the L2 (the FromL2 flag is consumed here), otherwise
+    // L1Cache.
+    assert(is_valid(cache_entry));
+    if (cache_entry.FromL2) {
+      cache_entry.FromL2 := false;
+      return MachineType:L2Cache;
+    } else {
+      return MachineType:L1Cache;
+    }
+  }
+
+  // Forwards a per-transition RequestType to the matching cache object as a
+  // data/tag array read/write. Unlisted values are ignored.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L1D0DataArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L2DataArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Asks the matching cache whether its data or tag array is available for
+  // the request. Read and write of the same array check the same resource;
+  // unlisted request types are always reported as available.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L2DataArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      return true;
+    }
+  }
+
+  // END INTERNAL FUNCTIONS
+
+  // ** OUT_PORTS **
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromCore);
+  out_port(responseNetwork_out, ResponseMsg, responseFromCore);
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+  out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+  // ** IN_PORTS **
+
+  // Internal trigger queue: L2-to-L1 fill messages enqueued by fi/f0/f1 are
+  // turned into the fill event for the destination L1.
+  // NOTE(review): a TriggerMsg whose Type is not L2_to_L1 raises no event.
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == TriggerType:L2_to_L1) {
+          if (in_msg.Dest == CacheId:L1I) {
+            trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D0) {
+            trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D1) {
+            trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("unexpected trigger dest");
+          }
+        }
+      }
+    }
+  }
+
+
+  // Probe network: classifies each probe by Type, then by the
+  // DemandRequest/ReturnData flags, into the probe events.
+  in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.DemandRequest) {
+            trigger(Event:PrbInvDataDemand, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          if (in_msg.DemandRequest) {
+            trigger(Event:PrbShrDataDemand, in_msg.addr, cache_entry, tbe);
+          } else {
+            assert(in_msg.ReturnData);
+            trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbRepl) {
+          trigger(Event:ForceRepl, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == ProbeRequestType:PrbRegDowngrade) {
+          trigger(Event:ForceDowngrade, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unknown probe request");
+        }
+      }
+    }
+  }
+
+
+  // ResponseNetwork
+  // NBSysResp is split by the granted coherence state; NBSysWBAck
+  // acknowledges a writeback.
+  // NOTE(review): an NBSysResp whose State is not Modified/Shared/Exclusive
+  // raises no event and no error.
+  in_port(responseToCore_in, ResponseMsg, responseToCore) {
+    if (responseToCore_in.isReady(clockEdge())) {
+      peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+          if (in_msg.State == CoherenceState:Modified) {
+              trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.State == CoherenceState:Shared) {
+            trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.State == CoherenceState:Exclusive) {
+            trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+          trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Nothing from the Unblock Network
+
+  // Mandatory Queue
+  // Core requests. contextId parity selects the cluster (even -> 0,
+  // odd -> 1). A request raises a hit/miss event only when the line is
+  // present or allocatable in the L2 (and, on an L1 miss, in the target L1);
+  // otherwise a replacement event is raised for the victim address chosen by
+  // cacheProbe().
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+        if (in_msg.Type == RubyRequestType:IFETCH) {
+          // FETCH ACCESS
+
+          if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+            if (mod(in_msg.contextId, 2) == 0) {
+              trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            } else {
+              trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            }
+          } else {
+            if (presentOrAvail2(in_msg.LineAddress)) {
+              if (presentOrAvailI(in_msg.LineAddress)) {
+                if (mod(in_msg.contextId, 2) == 0) {
+                  trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                }
+              } else {
+                Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L1I_Repl, victim,
+                        getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            } else { // Not present or avail in L2
+              Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+              DPRINTF(RubySlicc, "Victim for %s L2_Repl(0) is %s\n", in_msg.LineAddress, victim);
+              trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                      TBEs.lookup(victim));
+            }
+          }
+        } else {
+          // DATA ACCESS
+          if (mod(in_msg.contextId, 2) == 1) {
+            if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                // Stores must write through, make sure L2 avail.
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L2_Repl(1) is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD1(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L1D1_Repl is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L1D1_Repl, victim,
+                          getCacheEntry(victim), TBEs.lookup(victim));
+                }
+              } else { // not present or avail in L2
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                DPRINTF(RubySlicc, "Victim for %s L2_Repl(2) is %s\n", in_msg.LineAddress, victim);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            }
+          } else {
+            Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+            if (is_valid(L1D0cache_entry)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L2_Repl(3) is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD0(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L1D0_Repl is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              } else {
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                DPRINTF(RubySlicc, "Victim for %s L2_Repl(4) is %s\n", in_msg.LineAddress, victim);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                        TBEs.lookup(victim));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  // ACTIONS
+  // Invalidation actions: drop the line from the named cache if tagged there.
+  action(ii_invIcache, "ii", desc="invalidate iCache") {
+    if (L1Icache.isTagPresent(address)) {
+      L1Icache.deallocate(address);
+    }
+  }
+
+  action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+  }
+
+  action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+
+  action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+
+  // The transition's cache_entry is always unset, even if it was invalid.
+  action(i2_invL2, "i2", desc="invalidate L2") {
+    if(is_valid(cache_entry)) {
+      L2cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Request actions: all requests go to the peer returned by getPeer().
+  action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlk;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // NOTE(review): unlike nM_issueRdBlkM this does not set InitialRequestTime.
+  action(nMs_issueRdBlkMSinked, "nMs", desc="Issue RdBlkM with CtoDSinked") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.CtoDSinked := true;
+    }
+  }
+
+  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // NOTE(review): unlike nS_issueRdBlkS this does not set InitialRequestTime.
+  action(nSs_issueRdBlkSSinked, "nSs", desc="Issue RdBlkS with CtoDSinked") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.CtoDSinked := true;
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+    }
+  }
+
+  // Dirty victim: carries the L2 data; Shared is set only when evicting
+  // from state O.
+  action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  // Clean victim: no data; Shared is set only when evicting from state S.
+  action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      // CacheState is read below, so the entry must exist (same check as
+      // vd_victim).
+      assert(is_valid(cache_entry));
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  // Could send these two directly to dir if we made a new out network on channel 0
+  // The *Force variants are the victim actions above with Private set.
+  action(vdf_victimForce, "vdf", desc="Victimize M/O L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+      out_msg.Private := true;
+    }
+  }
+
+  action(vcf_victimForce, "vcf", desc="Victimize E/S L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      // CacheState is read below, so the entry must exist (same check as
+      // vdf_victimForce).
+      assert(is_valid(cache_entry));
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+      out_msg.Private := true;
+    }
+  }
+
+  // Allocation actions: allocate only if the line is not already tagged.
+  action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+    if (L1D0cache.isTagPresent(address) == false) {
+      L1D0cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+    if (L1D1cache.isTagPresent(address) == false) {
+      L1D1cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+    if (L1Icache.isTagPresent(address) == false) {
+      L1Icache.allocateVoid(address, new Entry);
+    }
+  }
+
+  action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L2cache.allocate(address, new Entry));
+    }
+  }
+
+  // Requires a valid L2 entry: the TBE snapshots its data and dirty bit.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk;  // Data only used for WBs
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Shared := false;
+  }
+
+  action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Queue pop actions.
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToCore_in.dequeue(clockEdge());
+  }
+
+  action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  // Local load-done actions: data is handed to the sequencer from the L2
+  // entry; the L1 entry only supplies the hit level (testAndClearLocalHit).
+  // 'sequencer' serves cluster 0 and 'sequencer1' cluster 1.
+  action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // l2entry.DataBlk is dereferenced below.
+    assert(is_valid(l2entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    //                   writes through to L2)
+    sequencer.readCallback(address,
+                           l2entry.DataBlk,
+                           true,
+                           testAndClearLocalHit(entry));
+  }
+
+  action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // l2entry.DataBlk is dereferenced below.
+    assert(is_valid(l2entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    //                   writes through to L2)
+    sequencer1.readCallback(address,
+                            l2entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+  }
+
+  action(l0_loadDone, "l0", desc="Cluster 0 load done") {
+    Entry entry := getL1CacheEntry(address, 0);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // l2entry.DataBlk is dereferenced below.
+    assert(is_valid(l2entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    //                   writes through to L2)
+    sequencer.readCallback(address,
+                           l2entry.DataBlk,
+                           true,
+                           testAndClearLocalHit(entry));
+  }
+
+  action(l1_loadDone, "l1", desc="Cluster 1 load done") {
+    Entry entry := getL1CacheEntry(address, 1);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // l2entry.DataBlk is dereferenced below.
+    assert(is_valid(l2entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    //                   writes through to L2)
+    sequencer1.readCallback(address,
+                            l2entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+  }
+
+  // External load-done actions: the response must come from a Directory or
+  // L3Cache; its timing fields are passed through to the sequencer.
+  action(xl0_loadDone, "xl0", desc="Cluster 0 load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // Validate before the trace below dereferences l2entry.
+      assert(is_valid(l2entry));
+      DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n",
+              address, l2entry.DataBlk);
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      //                   writes through to L2)
+      sequencer.readCallback(address,
+                             l2entry.DataBlk,
+                             false,
+                             machineIDToMachineType(in_msg.Sender),
+                             in_msg.InitialRequestTime,
+                             in_msg.ForwardRequestTime,
+                             in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  action(xl1_loadDone, "xl1", desc="Cluster 1 load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      //                   writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer1.readCallback(address,
+                              l2entry.DataBlk,
+                              false,
+                              machineIDToMachineType(in_msg.Sender),
+                              in_msg.InitialRequestTime,
+                              in_msg.ForwardRequestTime,
+                              in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      //                   writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer.readCallback(address,
+                             l2entry.DataBlk,
+                             false,
+                             machineIDToMachineType(in_msg.Sender),
+                             in_msg.InitialRequestTime,
+                             in_msg.ForwardRequestTime,
+                             in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      //                   writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer1.readCallback(address,
+                              l2entry.DataBlk,
+                              false,
+                              machineIDToMachineType(in_msg.Sender),
+                              in_msg.InitialRequestTime,
+                              in_msg.ForwardRequestTime,
+                              in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  // Local store done: the sequencer is handed the L2 block, then the L1
+  // copy is refreshed from the L2 and both entries are marked dirty.
+  action(s0_storeDone, "s0", desc="Cluster 0 store done") {
+    Entry entry := getL1CacheEntry(address, 0);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    sequencer.writeCallback(address,
+                            cache_entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+    cache_entry.Dirty := true;
+    entry.DataBlk := cache_entry.DataBlk;
+    entry.Dirty := true;
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+  }
+
+ 
action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the 
CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + 
out_msg.Dest := CacheId:L1D0;
+    }
+  }
+
+  // Enqueues an L2_to_L1 trigger destined for L1D1 after l2_hit_latency.
+  action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+      out_msg.addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+      out_msg.Dest := CacheId:L1D1;
+    }
+  }
+
+  // Copies DataBlk and Dirty from the peeked response into both the L1I
+  // entry and cache_entry.
+  action(wi_writeIcache, "wi", desc="write data to icache (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getICacheEntry(address);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Same as wi_writeIcache, but fills the cluster-0 L1 data entry.
+  action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 0);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Same as wi_writeIcache, but fills the cluster-1 L1 data entry.
+  action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 1);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Sends the TBE's data block to the directory as a CPUData writeback;
+  // NbReqShared mirrors tbe.Shared.
+  // NOTE(review): tbe is dereferenced without an is_valid(tbe) assert and
+  // in_msg is never read inside the peek -- confirm both are intentional.
+  action(wb_data, "wb", desc="write back data") {
+    peek(responseToCore_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:CPUData;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (tbe.Shared) {
+          out_msg.NbReqShared := true;
+        } else {
+          out_msg.NbReqShared := false;
+        }
+        out_msg.State := CoherenceState:Shared; // faux info
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Data-less probe ack reporting a miss: Dirty=false, Hit=false, Ntsl=true.
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Same message fields as pi_sendProbeResponseInv; additionally appends the
+  // transition comment "Setting Ms".
+  action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Ntsl := true;
+      out_msg.Hit := false;
+      APPEND_TRANSITION_COMMENT("Setting Ms");
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Data-less probe ack reporting a hit (Hit=true, Ntsl=false).
+  action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket
+      // A hit may only be reported while the line is in a core cache or a
+      // TBE exists for it.
+      assert(addressInCore(address) || is_valid(tbe));
+      out_msg.Dirty := false; // only true if sending back data i think
+      out_msg.Hit := true;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Data-less probe ack whose Hit flag reflects addressInCore(address).
+  action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      if (addressInCore(address)) {
+        out_msg.Hit := true;
+      } else {
+        out_msg.Hit := false;
+      }
+      out_msg.Dirty := false; // not sending back data, so def. not dirty
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Probe ack carrying cache_entry's data; asserts that the entry is valid
+  // and dirty before sending.
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Same message fields as pd_sendProbeResponseData; additionally appends
+  // the transition comment "Setting Ms".
+  action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      APPEND_TRANSITION_COMMENT("Setting Ms");
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Probe ack carrying the TBE's data; asserts that the TBE is valid and
+  // dirty before sending.
+  action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      assert(tbe.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Sends an InvAck request to getPeer(machineID) when there is no TBE or
+  // the TBE has AckNeeded set; otherwise only records a transition comment.
+  // NOTE(review): is_invalid(tbe) is tested first, presumably so that
+  // tbe.AckNeeded is never read from an invalid TBE -- confirm.
+  action(ra_sendReplAck, "ra", desc="Send ack to r-buf that line is replaced if needed") {
+    if (is_invalid(tbe) || tbe.AckNeeded) {
+      enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceRequestType:InvAck;
+        out_msg.Requestor := machineID;
+        out_msg.Destination.add(getPeer(machineID));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+      } 
+      APPEND_TRANSITION_COMMENT(" Sending ack to r-buf ");
+    } else {
+      APPEND_TRANSITION_COMMENT(" NOT Sending ack to r-buf ");
+    }
+  }
+
+  // Sets AckNeeded on the (asserted valid) TBE.
+  action(m_markAckNeeded, "m", desc="Mark TBE to send ack when deallocated") {
+    assert(is_valid(tbe));
+    tbe.AckNeeded := true;
+  }
+
+  // Sends a CPUCancelWB control response to the directory mapped to this
+  // address.
+  action(mc_cancelWB, "mc", desc="send writeback cancel to L3") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUCancelWB;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Sender := machineID;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Sets Shared on the (asserted valid) TBE.
+  action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") {
+    assert(is_valid(tbe));
+    tbe.Shared := true;
+  }
+
+  // Sends an unblock to the directory; wasValid carries isValid(address).
+  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+    enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.wasValid := isValid(address);
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Sends a DoneAck unblock to getPeer(machineID) with a fixed latency of 1.
+  // Dirty comes from the TBE if it is valid, else from cache_entry if that
+  // is valid, else false. validToInvalid is false.
+  action(sdv_sendDoneValid, "sdv", desc="Request finished, send done ack") {
+    enqueue(unblockNetwork_out, UnblockMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.DoneAck := true;
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else if (is_valid(cache_entry)) {
+        out_msg.Dirty := cache_entry.Dirty;
+      } else {
+        out_msg.Dirty := false;
+      }
+      out_msg.validToInvalid := false;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // DoneAck unblock built the same way as sdv_sendDoneValid, but reporting
+  // validToInvalid as true.
+  action(sdi_sendDoneInvalid, "sdi", desc="Request finished, send done ack") {
+    enqueue(unblockNetwork_out, UnblockMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.DoneAck := true;
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      if (is_valid(tbe)) {
+        out_msg.Dirty 
:= tbe.Dirty;
+      } else if (is_valid(cache_entry)) {
+        out_msg.Dirty := cache_entry.Dirty;
+      } else {
+        out_msg.Dirty := false;
+      }
+      out_msg.validToInvalid := true;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Demand-miss counters, one per cache.
+  action(l10m_profileMiss, "l10m", desc="l10m miss profile") {
+    ++L1D0cache.demand_misses;
+  }
+
+  action(l11m_profileMiss, "l11m", desc="l11m miss profile") {
+    ++L1D1cache.demand_misses;
+  }
+
+  // Short name matches the action prefix, like the sibling counters.
+  action(l1im_profileMiss, "l1im", desc="l1im miss profile") {
+    ++L1Icache.demand_misses;
+  }
+
+  action(l2m_profileMiss, "l2m", desc="l2m miss profile") {
+    ++L2cache.demand_misses;
+  }
+
+  // Re-queues the head of the probe queue after recycle_latency cycles.
+  action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Re-queues the head of the mandatory queue after recycle_latency cycles.
+  action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+    mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+  // END ACTIONS
+
+  // BEGIN TRANSITIONS
+
+  // transitions from base
+  transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // since in I state, L2 miss as well
+    // A data load profiles only the L1D0 and L2 misses, mirroring the
+    // cluster-1 transition below (no L1I miss is counted for a load).
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    a2_allocateL2;
+    i1_invCluster;
+    ii_invIcache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
+  transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // since in I state, L2 miss as well
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    a2_allocateL2;
+    i0_invCluster;
+    ii_invIcache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
+  // NOTE(review): the two I-state instruction-fetch transitions also bump
+  // the L1D0/L1D1 miss counters, unlike the Ifetch transitions from other
+  // base states -- confirm whether that is intended.
+  transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // L2 miss as well
+    l10m_profileMiss;
+    l2m_profileMiss;
+    l1im_profileMiss;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+    l11m_profileMiss;
+    // track misses, if 
implemented
+    // L2 miss as well
+    l2m_profileMiss;
+    l1im_profileMiss;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // Store misses from I: allocate in the requesting cluster's L1D and in L2,
+  // invalidate the other cluster and the I-cache, then run nM_issueRdBlkM.
+  transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead,L2TagArrayRead} {
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    a2_allocateL2;
+    i1_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // A cluster-1 store looks up the L1D1 tags, matching the other
+  // cluster-1 transitions and the a1_allocateL1D action it runs.
+  transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    a2_allocateL2;
+    i0_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // L1 misses from S: allocate in the missing L1 and schedule an L2-to-L1
+  // fill trigger.
+  transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+    l1im_profileMiss;
+    ai_allocateL1I;
+    fi_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l1im_profileMiss;
+    ai_allocateL1I;
+    fi_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // Stores from S (hit or miss) are counted as L2 and L1D misses before
+  // nM_issueRdBlkM runs.
+  transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead}{
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    i1_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead,L2TagArrayRead} {
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    i0_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+  transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? 
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // Instruction fetches from Es invalidate the L2 entry and both clusters,
+  // re-allocate L1I and L2, and run nS_issueRdBlkS.
+  transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // THESE SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW
+  transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayWrite,L1D0TagArrayRead, L2TagArrayRead, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+    a0_allocateL1D;
+    i1_invCluster;
+    s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay
+    p_popMandatoryQueue;
+  }
+
+  transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+    a1_allocateL1D;
+    i0_invCluster;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // A cluster-1 load miss looks up the L1D1 tags, like every other
+  // C1_Load_L1miss transition; the fill goes to L1D1 via f1_L2ToL1.
+  transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+    l2m_profileMiss; // permissions miss, still issue RdBlkS
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    i0_invCluster;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead } {
+    l2m_profileMiss; // 
permissions miss, still issue RdBlkS
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    i0_invCluster;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // Stores by the owning cluster complete immediately through
+  // s0_storeDone / s1_storeDone.
+  transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+    a0_allocateL1D;
+    s0_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  // NOTE(review): this cluster-1 store lists only L1D0 tag accesses, the
+  // same list as the E1/C0_Store_L1miss transition further down, although
+  // it allocates in and stores to L1D1 -- confirm the annotation.
+  transition(E0, C1_Store_L1miss, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+    a1_allocateL1D;
+    l11m_profileMiss;
+    i0_invCluster;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    a1_allocateL1D;
+    l11m_profileMiss;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    a0_allocateL1D;
+    l10m_profileMiss;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // Instruction fetches from E1 invalidate the L2 entry and cluster 1,
+  // then run nS_issueRdBlkS.
+  transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+    l2m_profileMiss; // permissions miss, still issue RdBlkS
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    i1_invCluster;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead,L1ITagArrayRead} {
+    l2m_profileMiss; // permissions miss, still issue RdBlkS
+    l1im_profileMiss;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    i1_invCluster;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+    a1_allocateL1D;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    i1_invCluster;
+    s0_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) 
{L1D0TagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss; // permissions miss; the request goes out via nM_issueRdBlkM
+    l10m_profileMiss;
+    a0_allocateL1D;
+    i1_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss; // permissions miss; the request goes out via nM_issueRdBlkM
+    l11m_profileMiss;
+    a1_allocateL1D;
+    i0_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // Load misses from O and Ms: allocate in the missing L1D and schedule an
+  // L2-to-L1 fill trigger.
+  transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // Instruction fetches to a line held in Ms/M0/M1/O: allocate L1I and a
+  // TBE, invalidate both clusters and run vd_victim.
+  transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss; // permissions miss
+    l1im_profileMiss;
+    ai_allocateL1I;
+    t_allocateTBE;
+    ib_invBothClusters;
+    vd_victim;
+//    i2_invL2;
+    p_popMandatoryQueue;
+  }
+
+  // Mirrors the Ifetch0 transition above: an instruction fetch profiles an
+  // L1I miss, not an L1D0 miss.
+  transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss; // permissions miss
+    l1im_profileMiss;
+    ai_allocateL1I;
+    t_allocateTBE;
+    ib_invBothClusters;
+    vd_victim;
+//    i2_invL2;
+    p_popMandatoryQueue;
+  }
+
+  transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+    a0_allocateL1D;
+    i1_invCluster;
+    s0_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, 
M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+    a1_allocateL1D;
+    i0_invCluster;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // No next state is given, so the line stays in M0.
+  transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} {
+    a0_allocateL1D;
+    s0_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  // NOTE(review): this cluster-1 store lists L1D0 arrays only, whereas the
+  // Ms -> M1 transition with the same actions lists L1D1 arrays -- confirm.
+  transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+    a1_allocateL1D;
+    i0_invCluster;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+    a0_allocateL1D;
+    i1_invCluster;
+    s0_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  // No next state is given, so the line stays in M1.
+  transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite} {
+    a1_allocateL1D;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  // end transitions from base
+
+  // Begin simple hit transitions
+  transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es,
+          Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} {
+    // track hits, if 
implemented
+    l0_loadDone;
+    p_popMandatoryQueue;
+  }
+
+  transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es,
+          Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} {
+    // track hits, if implemented
+    l1_loadDone;
+    p_popMandatoryQueue;
+  }
+
+  transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+    // track hits, if implemented
+    il0_loadDone;
+    p_popMandatoryQueue;
+  }
+
+  // An instruction-fetch hit reads the L1I data array, exactly like the
+  // Ifetch0_L1hit transition above.
+  transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+    // track hits, if implemented
+    il1_loadDone;
+    p_popMandatoryQueue;
+  }
+
+  // end simple hit transitions
+
+  // Transitions from transient states
+
+  // recycles
+  // Each transition below only re-queues the mandatory-queue request; no
+  // next state is given, so the line keeps its current state.
+  transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+          IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+          E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1,
+          O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+          IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+          E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0,
+          O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES,
+          IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F,
+          O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F,
+          E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C,
+          S_C}, 
{Ifetch0_L1miss, Ifetch1_L1miss}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  // Recycle transitions: each one only re-queues the mandatory-queue
+  // request; no next state is given, so the line keeps its current state.
+  transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1,
+          Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms,
+          M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  // State list uses comma separators throughout, like its C0 twin above.
+  transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0,
+          Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F,
+          M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+          IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0,
+          O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+          M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+          IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1,
+          O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+          M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+          IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+          E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+          IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+          E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, 
M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  // NOTE(review): the next two transitions fire on response/probe events
+  // but recycle the mandatory queue; a yy_recycleProbeQueue action exists
+  // for the probe queue -- confirm this is the intended queue.
+  transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS,
+          PrbInvData, PrbInvDataDemand, PrbInv, PrbShrData, PrbShrDataDemand}) {} {
+    zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+  }
+
+  transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} {
+    zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+  }
+
+  // A cluster-0 load miss arriving while a cluster-1 fill is pending (or
+  // while cluster 1 holds the line) schedules a second fill and moves to
+  // the combined *_F state.
+  transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // Requests from the other cluster while a store miss is outstanding:
+  // profile the misses, allocate the L1D entry and only change state; no
+  // new request is issued here.
+  transition(I_M0, C1_Load_L1miss, I_M0Ms){
+    l11m_profileMiss;
+    l2m_profileMiss;
+    a1_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  transition(I_M1, C0_Load_L1miss, I_M1Ms){
+    l10m_profileMiss;
+    l2m_profileMiss;
+    a0_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  transition(I_M0, C1_Store_L1miss, I_M0M1) {
+    l11m_profileMiss;
+    l2m_profileMiss;
+    a1_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  // NOTE(review): unlike its I_M0/C1_Store_L1miss twin above, this
+  // transition carries a resource list and does not run l10m_profileMiss --
+  // confirm whether the asymmetry is intended.
+  transition(I_M1, C0_Store_L1miss, I_M1M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+    l2m_profileMiss;
+    a0_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  transition(I_E0S, 
C1_Load_L1miss, I_ES) {} {
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  // Mirrors the I_E0S transition above: the L2 miss is profiled exactly
+  // once per request.
+  transition(I_E1S, C0_Load_L1miss, I_ES) {} {
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+  // A cluster-1 load miss arriving while a cluster-0 fill is pending (or
+  // while cluster 0 holds the line) schedules a second fill and moves to
+  // the combined *_F state.
+  transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S_F0, C1_Load_L1miss, S_F) { L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) {L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  // L1 replacements only invalidate the affected L1; no next state is
+  // given, so the line keeps its current state.
+  transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} {
+    i0_invCluster;
+  }
+
+  transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} {
+    i1_invCluster;
+  }
+
+  transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} {
+    ii_invIcache;
+  }
+
+  // L2 replacements notify both sequencers, allocate a TBE, run the victim
+  // action and invalidate L2 and the L1s.
+  transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead,L1D0TagArrayRead, L1D1TagArrayRead, L1ITagArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    t_allocateTBE;
+    vc_victim;
+    ib_invBothClusters;
+    i2_invL2;
+    ii_invIcache;
+  }
+
+  transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayRead, L1D1TagArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    t_allocateTBE;
+    vd_victim;
+    i2_invL2;
+    ib_invBothClusters; // nothing will happen for D0 on M1, vice versa
+  }
+
+  // NOTE(review): this transition fills the I-cache via wi_writeIcache but
+  // lists L1D0 arrays in its resource list -- confirm the annotation.
+  transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    wi_writeIcache;
+    xi0_loadDone;
+    uu_sendUnblock;
+    sdv_sendDoneValid;
+    pr_popResponseQueue;
+  }
+
+
transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    // NOTE(review): fills the I-cache via wi_writeIcache although the
+    // resource list names L1D1 arrays -- confirm the annotation.
+    wi_writeIcache;
+    xi1_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(S0_C, NB_AckS, S_C) { L1IDataArrayWrite,L2DataArrayWrite} {
+    // does not need send done since the rdblks was "sinked"
+    wi_writeIcache;
+    xi0_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  // NOTE(review): same actions as the S0_C transition above, but the
+  // resource list names L1D1DataArrayWrite instead of L1IDataArrayWrite --
+  // confirm.
+  transition(S1_C, NB_AckS, S_C) { L1D1DataArrayWrite,L2DataArrayWrite} {
+    wi_writeIcache;
+    xi1_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  // NB_AckM: write the response data into the requesting cluster's L1D and
+  // L2, complete the store, then send the done ack and the unblock.
+  transition(I_M0, NB_AckM, M0) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    w0_writeDcache;
+    xs0_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+    w1_writeDcache;
+    xs1_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  // THESE MO->M1 should not be instantaneous but oh well for now.
+  // The first cluster's store completes from the response, then that
+  // cluster is invalidated and the second cluster's store completes via
+  // s1_storeDone / s0_storeDone.
+  transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+    w0_writeDcache;
+    xs0_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    i0_invCluster;
+    s1_storeDone;
+    pr_popResponseQueue;
+  }
+
+  transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+    w1_writeDcache;
+    xs1_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    i1_invCluster;
+    s0_storeDone;
+    pr_popResponseQueue;
+  }
+
+  // Above should be more like this, which has some latency to xfer to L1
+  transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} {
+    w0_writeDcache;
+    xs0_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    f1_L2ToL1;
+    pr_popResponseQueue;
+  }
+
+  transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite,L2DataArrayWrite} {
+    w1_writeDcache;
+    xs1_storeDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    f0_L2ToL1;
+    pr_popResponseQueue;
+  }
+
+  // NB_AckE / NB_AckS for outstanding loads: write the data, complete the
+  // load(s), then send the done ack and the unblock.
+  transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    w0_writeDcache;
+    xl0_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    w1_writeDcache;
+    xl1_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } {
+    w0_writeDcache;
+    xl0_loadDone;
+    w1_writeDcache;
+    xl1_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    w0_writeDcache;
+    xl0_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+    w1_writeDcache;
+    xl1_loadDone;
+
sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + 
c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  // L2 -> L1 fill completions. Each transition below runs the copy action for
+  // the cluster named by the event (c0_/c1_copyL2ToL1), the matching
+  // l0_/l1_loadDone, and then pops the trigger queue. The brace list in front
+  // of each body names the cache-array resources the transition requests.
+  transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  // Fixed: this L1D1 fill previously requested L1IDataArrayWrite. It runs
+  // c1_copyL2ToL1, so it now requests L1D1DataArrayWrite like the other
+  // L2_to_L1D1 fill transitions.
+  transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  // NOTE(review): the Es_F / E0_F / E1_F fills below request only L2 reads and
+  // no L1 array writes, unlike the fills above -- confirm this is intended.
+  transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} {
+    c0_copyL2ToL1;
+    l0_loadDone;
+    pt_popTriggerQueue;
+  }
+
+  transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead,
L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) {} { + pt_popTriggerQueue; + } + + transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + 
// Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, {PrbInvData, PrbInvDataDemand}, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; 
+ ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, {PrbShrData, PrbShrDataDemand}, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, {PrbShrData, PrbShrDataDemand}, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, {PrbShrData, PrbShrDataDemand}) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbInvDataDemand, PrbShrData, PrbShrDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } 
+
+  // Probes that arrive while a victim/writeback is outstanding (MO_I, ES_I)
+  // or while a writeback-then-refetch is in flight (MO_S0, MO_S1).
+  transition(MO_I, PrbInv, I_C) {} {
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  transition(ES_I, PrbInv, I_C) {} {
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  // Shared probes leave the state unchanged; the response is followed by
+  // s_setSharedFlip before the probe is popped.
+  transition(ES_I, {PrbShrData, PrbShrDataDemand}, ES_I) {} {
+    ph_sendProbeResponseHit;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, {PrbShrData, PrbShrDataDemand}, MO_I) {} {
+    pdt_sendProbeResponseDataFromTBE;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S0, {PrbInvData, PrbInvDataDemand}, S0_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdt_sendProbeResponseDataFromTBE;
+    i2_invL2;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    d_deallocateTBE;
+    pp_popProbeQueue;
+  }
+
+  // Fixed: the resource list was empty here, although the MO_S0 mirror above
+  // and both PrbInv variants below request L2TagArrayWrite for the same
+  // i2_invL2 / a2_allocateL2 actions.
+  transition(MO_S1, {PrbInvData, PrbInvDataDemand}, S1_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdt_sendProbeResponseDataFromTBE;
+    i2_invL2;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    d_deallocateTBE;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    i2_invL2;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    d_deallocateTBE;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    i2_invL2;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    d_deallocateTBE;
+    pp_popProbeQueue;
+  }
+
+  transition({MO_S0, MO_S1}, {PrbShrData, PrbShrDataDemand}) {} {
+    pdt_sendProbeResponseDataFromTBE;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a0_allocateL1D; +
a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, {PrbShrData, PrbShrDataDemand}, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, {PrbShrData, PrbShrDataDemand}, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, {PrbShrData, PrbShrDataDemand}, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, {PrbInvData, 
PrbInvDataDemand}, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // The O_M* data-probe transitions answer with pdm_sendProbeResponseDataMs
+  // and therefore request L2DataArrayRead; the S_M* ones answer with
+  // pim_sendProbeResponseInvMs and request nothing.
+  transition(O_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M0, O_M0}, {PrbInv}, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition(S_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // Fixed: the resource list was empty here, although this transition sends
+  // probe data exactly like its O_M0 mirror, which requests L2DataArrayRead.
+  transition(O_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M1, O_M1}, {PrbInv}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // No destination state is given for the S0/S1 families, so the state is
+  // left unchanged; everything is invalidated and L1I/L2 room is re-allocated.
+  transition({S0, S0_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S1, S1_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I; +
a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, S_M1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({O_M0, O_M1}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({S0, S1, S0_C, S1_C}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, {PrbInvData, PrbInvDataDemand}, IF_E0S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, {PrbInvData, PrbInvDataDemand}, IF_E1S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, {PrbInvData, PrbInvDataDemand}, IF_ES) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInv, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + 
a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Shared-data probes that hit a modified line while an L2->L1 fill is
+  // pending: answer with data, move to the matching O_F* state, pop the probe.
+  transition({Ms_F0, M0_F, M1_Ms}, {PrbShrData, PrbShrDataDemand}, O_F0) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Fixed: this transition used to be empty with no resources. It changed
+  // state to O_F1 without answering or popping the probe, and only worked
+  // because the still-queued probe was then handled again by the
+  // {O_F0, O_F1, O_F} transition below. It now mirrors the cluster-0 variant
+  // above: one data response, one pop, same final state.
+  transition({Ms_F1, M1_F, M0_Ms}, {PrbShrData, PrbShrDataDemand}, O_F1) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F}, {PrbShrData, PrbShrDataDemand}, O_F) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({O_F0, O_F1, O_F}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
new file mode 100644
index 000000000..52d87fb8b
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
@@ -0,0 +1,2038 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD_Base-like protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles response_latency_regionDir := 1; + Cycles l3_hit_latency := 30; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + // To the Cores + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + // From region buffer + MessageBuffer * reqFromRegBuf, network="From", virtual_network="7", vnet_type="request"; + + // To Region directory + MessageBuffer * reqToRegDir, network="To", virtual_network="5", vnet_type="request"; + MessageBuffer * reqFromRegDir, network="From", virtual_network="5", vnet_type="request"; + MessageBuffer * unblockToRegDir, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, 
AccessPermission:Backing_Store, desc="unblocked"; + BR, AccessPermission:Backing_Store, desc="got CPU read request, blocked while sent to L3"; + BW, AccessPermission:Backing_Store, desc="got CPU write request, blocked while sent to L3"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. + BI, AccessPermission:Backing_Store, desc="Blocked waiting for inv ack from core"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + // These are needed for when a private requests was issued before an inv was received + // for writebacks + BS_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + // for reads + 
BS_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + RdBlkSP, desc="..."; + RdBlkMP, desc="..."; + RdBlkP, desc="..."; + VicDirtyP, desc="..."; + VicCleanP, desc="..."; + WriteThroughP, desc="WriteThrough Message"; + AtomicP, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="WB response for a no longer valid request"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + LastCPUPrbResp, desc="Last Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="unblock, self triggered"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + StaleVicDirtyP, desc="Core invalidated before VicDirty processed"; + + // For region protocol + CPUReq, desc="Generic CPU request"; + Inv, desc="Region dir needs a block invalidated"; + Downgrade, desc="Region dir needs a block downgraded"; + + // For private accesses (bypassed reg-dir) + CPUReadP, desc="Initial req from core, sent to L3"; + CPUWriteP, desc="Initial req from core, sent to L3"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + 
L3DataArrayRead, desc="Read the data array";
+    L3DataArrayWrite, desc="Write the data array";
+    // Fixed: the two tag-array entries carried copy/pasted "data array" text.
+    L3TagArrayRead, desc="Read the tag array";
+    L3TagArrayWrite, desc="Write the tag array";
+  }
+
+  // TYPES
+
+  // DirectoryEntry
+  // Per-address directory record: the directory state, the data block, and
+  // the set of requestors whose VicDirty should be ignored.
+  structure(Entry, desc="...", interface="AbstractEntry") {
+    State DirectoryState, desc="Directory state";
+    DataBlock DataBlk, desc="data for the block";
+    NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+  }
+
+  // L3 cache line: the data plus the machine it last came from.
+  structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+    DataBlock DataBlk, desc="data for the block";
+    MachineID LastSender, desc="Mach which this block came from";
+  }
+
+  // Transaction buffer entry holding the in-flight state for one address.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    DataBlock DataBlkAux, desc="Auxiliary data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    int NumPendingAcks, desc="num acks expected";
+    MachineID OriginalRequestor, desc="Original Requestor";
+    MachineID WTRequestor, desc="WT Requestor";
+    bool Cached, desc="data hit in Cache";
+    bool MemData, desc="Got MemData?",default="false";
+    bool wtData, desc="Got write through data?",default="false";
+    bool atomicData, desc="Got Atomic op?",default="false";
+    Cycles InitialRequestTime, desc="...";
+    Cycles ForwardRequestTime, desc="...";
+    Cycles ProbeRequestStartTime, desc="...";
+    bool DemandRequest, desc="for profiling";
+    MachineID LastSender, desc="Mach which this block came from";
+    bool L3Hit, default="false", desc="Was this an L3 hit?";
+    bool TriggeredAcksComplete, default="false", desc="True if already triggered acks complete";
+    WriteMask writeMask, desc="outstanding write through mask";
+  }
+
+  // Externally implemented table of TBEs, keyed by address.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  // Fixed: the template argument was empty (template=""), which looks like an
+  // angle-bracketed value lost in extraction. Restored following the SLICC
+  // <MachineType>_TBE naming convention.
+  // NOTE(review): confirm against the upstream file.
+  TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  void set_tbe(TBE a);
+  void unset_tbe();
+  void
wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + State getStateFromAddr(Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if(directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == 
RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(requestNetworkReg_out, CPURequestMsg, reqToRegDir); + out_port(regAckNetwork_out, UnblockMsg, unblockToRegDir); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=7) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + } else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=6) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=5) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, 
+  // Core response network
+  // Turns each response arriving on responseFromCores into a directory
+  // event for the message's address, passing along the TBE and L3 entry
+  // looked up for that address.
+  in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=4) {
+    if (responseNetwork_in.isReady(clockEdge())) {
+      peek(responseNetwork_in, ResponseMsg) {
+        DPRINTF(RubySlicc, "core responses %s\n", in_msg);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+          // A probe response is the "last" one only when a TBE exists, exactly
+          // one ack is still outstanding, and AcksComplete has not already
+          // been triggered for this TBE.
+          if (is_valid(tbe) && tbe.NumPendingAcks == 1
+            && tbe.TriggeredAcksComplete == false) {
+            trigger(Event:LastCPUPrbResp, in_msg.addr, entry, tbe);
+          } else {
+            trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+          trigger(Event:CPUData, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+          trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+        } else {
+          error("Unexpected response type");
+        }
+      }
+    }
+  }
+  // Requests arriving on reqFromRegBuf. Only ForceInv and ForceDowngrade
+  // are accepted; they become the Inv and Downgrade events respectively.
+  // Any other type is reported through error().
+  in_port(regBuf_in, CPURequestMsg, reqFromRegBuf, rank=2) {
+    if (regBuf_in.isReady(clockEdge())) {
+      peek(regBuf_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceRequestType:ForceInv) {
+          trigger(Event:Inv, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:ForceDowngrade) {
+          trigger(Event:Downgrade, in_msg.addr, entry, tbe);
+        } else {
+          error("Bad request from region buffer");
+        }
+      }
+    }
+  }
+  // Requests arriving directly from the cores on requestFromCores.
+  // Messages flagged Private are decoded here into the "P" event variants;
+  // every non-private message raises the single CPUReq event instead.
+  in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+    if (requestNetwork_in.isReady(clockEdge())) {
+      peek(requestNetwork_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Private) {
+          // Bypass the region dir
+          if (in_msg.Type == CoherenceRequestType:RdBlk) {
+            trigger(Event:RdBlkP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+            trigger(Event:RdBlkSP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+            trigger(Event:RdBlkMP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+            trigger(Event:AtomicP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+            trigger(Event:WriteThroughP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+            // A victim from a requestor listed in the directory entry's
+            // VicDirtyIgnore set is treated as stale.
+            if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+              DPRINTF(RubySlicc, "Dropping VicDirtyP for address %s\n", in_msg.addr);
+              trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+            } else {
+              DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+              trigger(Event:VicDirtyP, in_msg.addr, entry, tbe);
+            }
+          } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+            // Stale clean victims reuse the StaleVicDirtyP event; there is
+            // no separate stale-clean event in this dispatch.
+            if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+              DPRINTF(RubySlicc, "Dropping VicCleanP for address %s\n", in_msg.addr);
+              trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+            } else {
+              DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+              trigger(Event:VicCleanP, in_msg.addr, entry, tbe);
+            }
+          } else {
+            error("Bad message type for private access");
+          }
+        } else {
+          trigger(Event:CPUReq, in_msg.addr, entry, tbe);
+        }
+      }
+    }
+  }
+  // Complete a request that ends in the Modified state.
+  //  - tbe.wtData set: no data response is sent from here; only an
+  //    UnblockWriteThrough trigger is queued.
+  //  - otherwise: an NBSysResp carrying the TBE data is sent to the original
+  //    requestor, and if tbe.atomicData is set the response also carries
+  //    WTRequestor and an UnblockWriteThrough trigger is queued afterwards.
+  action(m_sendResponseM, "m", desc="send Modified response") {
+    if (tbe.wtData) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:UnblockWriteThrough;
+      }
+    } else {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        // On an L3 hit the response is attributed to L3Cache 0 rather than
+        // to this controller.
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(tbe.OriginalRequestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        out_msg.CtoD := false;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+        out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.DemandRequest := tbe.DemandRequest;
+        out_msg.L3Hit := tbe.L3Hit;
+        if (tbe.atomicData) {
+          out_msg.WTRequestor := tbe.WTRequestor;
+        }
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+      if (tbe.atomicData) {
+        enqueue(triggerQueue_out, TriggerMsg, 1) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:UnblockWriteThrough;
+        }
+      }
+    }
+  }
+  // Answer a request read from requestNetwork_in directly:
+  //  - WriteThrough gets a data-less NBSysWBAck.
+  //  - anything else must be an Atomic (asserted) and gets an NBSysResp in
+  //    the Modified state carrying the directory entry's data block.
+  // In both cases an UnblockWriteThrough trigger is queued afterwards.
+  action(mbwt_sendResponseWriteThroughBypass, "mbwt", desc="send write through response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:NBSysWBAck;
+          out_msg.Destination.add(in_msg.Requestor);
+          out_msg.WTRequestor := in_msg.WTRequestor;
+          out_msg.Sender := machineID;
+          out_msg.MessageSize := MessageSizeType:Writeback_Control;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := curCycle();
+          out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+          out_msg.DemandRequest := false;
+        }
+      } else {
+        assert(in_msg.Type == CoherenceRequestType:Atomic);
+        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:NBSysResp;
+          if (tbe.L3Hit) {
+            out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+          } else {
+            out_msg.Sender := machineID;
+          }
+          out_msg.Destination.add(in_msg.Requestor);
+          // Data is read from the directory entry here, not from tbe.DataBlk.
+          out_msg.DataBlk := getDirectoryEntry(address).DataBlk;
+          out_msg.MessageSize := MessageSizeType:Response_Data;
+          out_msg.Dirty := in_msg.Dirty;
+          out_msg.State := CoherenceState:Modified;
+          out_msg.CtoD := false;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := curCycle();
+          out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+          out_msg.OriginalResponder := tbe.LastSender;
+          out_msg.DemandRequest := false;
+          out_msg.L3Hit := tbe.L3Hit;
+          out_msg.WTRequestor := in_msg.WTRequestor;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+      }
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:UnblockWriteThrough;
+      }
+    }
+  }
+  // Send a data-less NBSysResp (CtoD set, Modified state) back to the
+  // requestor recorded in the TBE.
+  action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+    enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+      // Routing
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(tbe.OriginalRequestor);
+
+      // Payload: control-sized message, no data block attached
+      out_msg.Type := CoherenceResponseType:NBSysResp;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.CtoD := true;
+      out_msg.State := CoherenceState:Modified;
+      out_msg.Dirty := false;
+      out_msg.DemandRequest := tbe.DemandRequest;
+
+      // Timing: the forward time is stamped now, the rest come from the TBE
+      out_msg.InitialRequestTime := tbe.InitialRequestTime;
+      out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+      out_msg.ForwardRequestTime := curCycle();
+
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+  // Send a DirReadyAck to the region directory that owns this address,
+  // unless the request read from regDir_in is marked NoAckNeeded.
+  action(ra_ackRegionDir, "ra", desc="Ack region dir") {
+    peek(regDir_in, CPURequestMsg) {
+      if (in_msg.NoAckNeeded == false) {
+        enqueue(responseNetwork_out, ResponseMsg, response_latency_regionDir) {
+          out_msg.Type := CoherenceResponseType:DirReadyAck;
+          out_msg.MessageSize := MessageSizeType:Writeback_Control;
+          out_msg.addr := address;
+          out_msg.Sender := machineID;
+          out_msg.Destination.add(map_Address_to_RegionDir(address));
+        }
+      }
+    }
+  }
+  // Send a PrbDowngrade probe (data requested) to the sharers listed in the
+  // request read from regBuf_in, and add one pending ack per sharer to the
+  // TBE. The description and transition-comment tag identify this action as
+  // "ddr"/downgrade; they previously repeated the text of the invalidating
+  // sibling dcr_probeInvCoreData, which made protocol traces ambiguous.
+  action(ddr_probeDownCoreData, "ddr", desc="probe downgrade cores, return data") {
+    peek(regBuf_in, CPURequestMsg) {
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbDowngrade;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination := in_msg.Sharers;
+        // One ack is expected from every probed sharer.
+        tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        APPEND_TRANSITION_COMMENT(" ddr: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+  // Allocate a TBE for the request read from regDir_in and fill it from
+  // that message and the current directory data.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    peek(regDir_in, CPURequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+
+      // Request bookkeeping copied from the incoming message.
+      tbe.OriginalRequestor := in_msg.Requestor;
+      tbe.NumPendingAcks := 0;
+      tbe.Cached := in_msg.ForceShared;
+      tbe.DemandRequest := in_msg.DemandRequest;
+      tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.ForwardRequestTime := curCycle();
+      tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+
+      // Start from the directory's copy, marked clean.
+      tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+      tbe.Dirty := false;
+
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.wtData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+        // Apply the written bytes on top of the directory copy; the TBE
+        // stays marked clean, as in the original sequence.
+        tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+      } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.atomicData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+      }
+    }
+  }
+  // Commit the TBE's data to the directory entry. The three cases are
+  // mutually exclusive and checked in this order: write-through, atomic,
+  // then plain dirty data. A clean TBE with neither flag set writes nothing.
+  action(wd_writeBackData, "wd", desc="Write back data if needed") {
+    if (tbe.wtData) {
+      // Merge via a temporary that starts as the directory copy; the result
+      // is stored in both the TBE and the directory entry.
+      // NOTE(review): copyPartial presumably overlays only the bytes
+      // selected by writeMask - confirm against DataBlock.
+      DataBlock tmp := getDirectoryEntry(address).DataBlk;
+      tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+      tbe.DataBlk := tmp;
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    } else if (tbe.atomicData) {
+      // atomicPartial updates tbe.DataBlk in place using the directory data
+      // and the write mask; the result is then stored to the directory.
+      tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask);
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    } else if (tbe.Dirty == true) {
+      APPEND_TRANSITION_COMMENT(" Wrote data back ");
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    }
+  }
+  // Fold a probe response read from responseNetwork_in into the TBE.
+  // Only dirty responses touch the data; a response with Hit set marks the
+  // TBE as Cached whether or not it was dirty.
+  action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty) {
+        DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+        DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+        if (tbe.wtData) {
+          // Write-through pending: start from the probed data and re-apply
+          // the TBE's data under writeMask on top of it.
+          DataBlock tmp := in_msg.DataBlk;
+          tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+          tbe.DataBlk := tmp;
+        } else if (tbe.Dirty) {
+          // The TBE already holds dirty data: nothing is copied. For
+          // non-atomic requests the second copy must match the first.
+          // NOTE(review): tbe.wtData == false is always true on this branch,
+          // because the enclosing else excludes wtData.
+          if(tbe.atomicData == false && tbe.wtData == false) {
+            DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+            assert(tbe.DataBlk == in_msg.DataBlk);  // in case of double data
+          }
+        } else {
+          // First dirty copy seen: adopt it and remember who sent it.
+          tbe.DataBlk := in_msg.DataBlk;
+          tbe.Dirty := in_msg.Dirty;
+          tbe.LastSender := in_msg.Sender;
+        }
+      }
+      if (in_msg.Hit) {
+        tbe.Cached := true;
+      }
+    }
+  }
from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := false; + tbe.LastSender := in_msg.Sender; + } + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + if (tbe.NumPendingAcks > 0) { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } else { + APPEND_TRANSITION_COMMENT(" Double ack! "); + } + assert(tbe.NumPendingAcks >= 0); + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(ont_checkForCompletionNoTrigger, "ont", desc="check for ack completion, no trigger") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rvp_removeVicDirtyIgnore, "rvp", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(regDir_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(r_sendRequestToRegionDir, "r", desc="send request to Region Directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetworkReg_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + 
out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "out dest: %s\n", map_Address_to_RegionDir(address)); + } + } + } + + action(ai_ackInvalidate, "ai", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(regBuf_in, CPURequestMsg) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + } + } + } + + action(aic_ackInvalidate, "aic", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + if (machineIDToMachineType(in_msg.Sender) == MachineType:CorePair) { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(0))); + } else { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(1))); + } + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + out_msg.wasValid := in_msg.isValid; + } + } + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + 
assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3Block, "ali", desc="allocate the L3 block on ForceInv") { + if (tbe.Dirty == true) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); 
+ L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3BlockNoTBE, "alt", desc="allocate the L3 block on ForceInv no TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(prd_popRegionQueue, "prd", desc="pop request queue") { + regDir_in.dequeue(clockEdge()); + } + + action(prb_popRegionBufQueue, "prb", desc="pop request queue") { + regBuf_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + 
memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + // Calls recycle() on responseNetwork_in with recycle_latency converted to ticks + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // NOTE: the desc says "recycle", but the body calls stall_and_wait() on regDir_in + action(ww_stallAndWaitRegRequestQueue, "ww", desc="recycle region dir request queue") { + stall_and_wait(regDir_in, address); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + // Empty body: this action performs no operation + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + + // transitions from U + + // NOTE(review): the U transitions below pop Inv/Downgrade with + // prb_popRegionBufQueue, while this stall acts on regDir_in -- confirm the port + transition({BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Inv, Downgrade}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(U, Inv, BI){L3TagArrayRead} { + tr_allocateTBE; + dcr_probeInvCoreData; // only need to invalidate sharers + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + transition(U, Downgrade, BI){L3TagArrayRead} { + tr_allocateTBE; + ddr_probeDownCoreData; // presumably only needs to downgrade sharers -- TODO confirm + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + // The next 2 transitions are needed in the event that an invalidation + // is waiting for its ack from the core, but the event makes it through + // the region directory before the acks. 
This wouldn't be needed if + // we waited to ack the region dir until the directory got all the acks + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, WriteThrough, Atomic}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {RdBlkSP, RdBlkMP, RdBlkP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {WriteThroughP,AtomicP}) { + st_stallAndWaitRequest; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, WriteThrough, BM_PM){L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkM,Atomic}, BM_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, RdBlk, B_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkSP}, BS_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, WriteThroughP, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, {RdBlkMP,AtomicP}, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, RdBlkP, B_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, VicDirtyP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + 
wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(U, VicCleanP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkSP, BM_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkSP, BS_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkSP, B_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkSP, BP_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkMP, BM_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkMP, BS_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkMP, B_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkMP, BP_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BM_Pm, {WriteThroughP,AtomicP}, BM_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BS_Pm, {WriteThroughP,AtomicP}, BS_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(B_Pm, {WriteThroughP,AtomicP}, B_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BP, {WriteThroughP,AtomicP}, BP_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkP, BM_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkP, BS_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkP, B_Pm_B) {L3DataArrayWrite}{ 
+ esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkP, BP_B) {L3DataArrayWrite}{ + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BM_Pm_B, CoreUnblock, BM_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BS_Pm_B, CoreUnblock, BS_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B_Pm_B, CoreUnblock, B_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BP_B, CoreUnblock, BP) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BM_Pm_B, UnblockWriteThrough, BM_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_Pm_B, UnblockWriteThrough, BS_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(B_Pm_B, UnblockWriteThrough, B_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BP_B, UnblockWriteThrough, BP) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BM_Pm, VicDirtyP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicDirtyP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicDirtyP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicDirtyP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, VicCleanP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicCleanP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicCleanP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicCleanP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm_BL, CPUData, BM_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BS_Pm_BL, CPUData, BS_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(B_Pm_BL, CPUData, B_Pm) { 
+ yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BP_BL, CPUData, BP) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition({BR, BW, BL}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BL}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + dt_deallocateTBE; + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirtyP) { + rvp_removeVicDirtyIgnore; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, 
VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition({B, BR}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition({B, BR}, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_M, MemData, B) {L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(B_PM, 
L3Hit, B_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP, BI}, CPUPrbResp) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition({B, B_M, BS_M, BM_M}, {CPUPrbResp, LastCPUPrbResp}) { + z_stall; + } + + transition({BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {CPUPrbResp, LastCPUPrbResp}) { + // recycling because PrbResponse and data come on the same network + yy_recycleResponseQueue; + } + + transition(U, {CPUPrbResp, LastCPUPrbResp}) {L3TagArrayRead, L3DataArrayWrite} { + aic_ackInvalidate; + wdt_writeBackDataInvNoTBE; + ali_allocateL3BlockNoTBE; + pr_popResponseQueue; + } + + transition(BL, {CPUPrbResp, LastCPUPrbResp}) {} { + aic_ackInvalidate; + y_writeProbeDataToTBE; + wdi_writeBackDataInv; + ali_allocateL3Block; + pr_popResponseQueue; + } + + transition(BS_PM, LastCPUPrbResp, BS_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + pt_popTriggerQueue; + } + + transition(BM_PM, LastCPUPrbResp, BM_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + pt_popTriggerQueue; + } + + transition(B_PM, LastCPUPrbResp, B_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + pt_popTriggerQueue; + } + + transition(BS_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BS_Pm, 
ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pr_popResponseQueue; + } + + // alwt_allocateL3BlockOnWT can write the L3 entry's DataBlk, so request the + // data array together with the tag array, as the *_Pm transitions above do + transition(BP, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + // NOTE(review): this path ends in B, whereas the ProbeAcksComplete path + // out of BI below ends in U -- confirm the intended final state + transition(BI, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + wa_wakeUpDependents; + wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BI, ProbeAcksComplete, U) {L3TagArrayWrite, L3DataArrayWrite}{ + wa_wakeUpDependents; 
+ wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm new file mode 100644 index 000000000..823933e57 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +// Request types used by CPURequestMsg.Type: core requests first, then the +// region directory / region buffer messages grouped by the comments below +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? + + WBApproval, desc="WB Approval"; + + // Messages between Dir and R-Dir + ForceInv, desc="Send invalide to the block"; + ForceDowngrade, desc="Send downgrade to the block"; + Unblock, desc="Used to let the dir know a message has been sunk"; + + // Messages between R-Dir and R-Buffer + PrivateNotify, desc="Let region buffer know it has private access"; + SharedNotify, desc="Let region buffer know it has shared access"; + WbNotify, desc="Let region buffer know it saw its wb request"; + Downgrade, desc="Force the region buffer to downgrade to shared"; + // Response to R-Dir (probably should be on a different network, but + // I need it to be ordered with respect to requests) + InvAck, desc="Let the R-Dir know when the inv has occured"; + + // Requests issued by the R-Buffer (per the descs below) + PrivateRequest, desc="R-buf wants the region in private"; + UpgradeRequest, desc="R-buf wants the region in private"; // NOTE(review): desc identical to PrivateRequest -- confirm + SharedRequest, desc="R-buf wants the region in shared (could respond with private)"; + CleanWbRequest, desc="R-buf wants to deallocate clean region"; + + // Placeholder value; CPURequestMsg.OriginalType defaults to it + NA, desc="So we don't get segfaults"; +} + 
+enumeration(ProbeRequestType, desc="Probe Request Types") { + PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbInv, desc="Probe to Invalidate"; + + // For regions + PrbRepl, desc="Force the cache to do a replacement"; + PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS +} + + +enumeration(CoherenceResponseType, desc="Coherence Response Types") { + NBSysResp, desc="Northbridge response to CPU Rd request"; + NBSysWBAck, desc="Northbridge response ok to WB"; + TDSysResp, desc="TCCdirectory response to CPU Rd request"; + TDSysWBAck, desc="TCCdirectory response ok to WB"; + TDSysWBNack, desc="TCCdirectory response ok to drop"; + CPUPrbResp, desc="CPU Probe Response"; + CPUData, desc="CPU Data"; + StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; + CPUCancelWB, desc="want to cancel WB to Memory"; + MemData, desc="Data from Memory"; + + // for regions + PrivateAck, desc="Ack that r-buf received private notify"; + RegionWbAck, desc="Writeback Ack that r-buf completed deallocation"; + DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake"; +} + +enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") { + Modified, desc="Modified"; + Owned, desc="Owned state"; + Exclusive, desc="Exclusive"; + Shared, desc="Shared"; + NA, desc="NA"; +} + +// Request message; Type holds a CoherenceRequestType, and DataBlk/Dirty are only for WB +structure(CPURequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + Addr DemandAddress, desc="Physical block address for this request"; + CoherenceRequestType Type, desc="Type of request"; + DataBlock DataBlk, desc="data for the cache line"; // only for WB + bool Dirty, desc="whether WB data is dirty"; // only for WB + MachineID Requestor, desc="Node who initiated the request"; + NetDest Destination, desc="Multicast destination mask"; + bool Shared, desc="For CPU_WrVicBlk, vic is O not M. 
For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + + bool functionalRead(Packet *pkt) { + // Only VicDirty requests are read functionally; every other type returns false + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +// Probe request message; it declares no DataBlock and both functional accessors return false +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="probe signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + 
Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so there is nothing + // to write and the call always reports false + return false; + } + +} + +// Probe request message; it declares no DataBlock and both functional accessors return false +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; // NOTE(review): desc repeats wasValid's -- confirm + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so there is nothing + // to write and the call always reports false + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By 
CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + + bool functionalRead(Packet *pkt) { + // Only CPUData and MemData responses are read functionally; other types return false + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + 
+structure(UnblockMsg, desc="...", interface="Message") { // NOTE(review): the RegionBuffer machine in this patch reads in_msg.DoneAck and in_msg.validToInvalid from UnblockMsg, but neither field is declared here -- confirm which UnblockMsg definition that protocol includes + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", desc="Cache to invalidate"; + + bool functionalRead(Packet *pkt) { // always false: this message declares no DataBlk field to read from + return false; + } + + bool functionalWrite(Packet *pkt) { + // Trigger messages declare no data block (only addr, Type and Dest), + // so there is nothing to update and this always returns false + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm new file mode 100644 index 000000000..89f7d6fcb --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm @@ -0,0 +1,1368 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. 
Must set block size same as below + bool isOnCPU; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 5; // Latency to fwd requests to directory + Cycles toRegionDirLatency := 5; // Latency for requests and acks to directory + Cycles nextEvictLatency := 1; // latency added between each block while evicting region + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // From the Cores + MessageBuffer * requestFromCore, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCore, network="From", virtual_network="2", vnet_type="response"; + + // Requests to the cores or directory + MessageBuffer * requestToNetwork, network="To", virtual_network="0", vnet_type="request"; + + // From Region-Dir + MessageBuffer * notifyFromRegionDir, network="From", virtual_network="7", vnet_type="request"; + MessageBuffer * probeFromRegionDir, network="From", virtual_network="8", vnet_type="request"; + + // From the directory + MessageBuffer * unblockFromDir, network="From", virtual_network="4", vnet_type="unblock"; + + // To the region-Dir + MessageBuffer * responseToRegDir, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionBuffer_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to the cache"; + S, AccessPermission:Invalid, desc="Region is possibly shared with others"; + + NP_PS, AccessPermission:Invalid, desc="Intermediate state waiting for notify from r-dir"; + S_P, AccessPermission:Invalid, desc="Intermediate state while upgrading region"; + + P_NP, AccessPermission:Invalid, desc="Intermediate state while evicting all lines in region"; + P_S, AccessPermission:Invalid, desc="Intermediate state while downgrading all lines in region"; + + S_NP_PS, AccessPermission:Invalid, desc="Got an inv in S_P, waiting for all inv 
acks, then going to since the write is already out there NP_PS"; + P_NP_NP, AccessPermission:Invalid, desc="Evicting region on repl, then got an inv. Need to re-evict"; + + P_NP_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + P_S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_NP_PS_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + + SS_P, AccessPermission:Invalid, desc="Waiting for CPU write that we know is there"; + + P_NP_W, AccessPermission:Invalid, desc="Waiting for writeback ack"; + + NP_W, AccessPermission:Invalid, desc="Got a done ack before request, waiting for that victim"; + } + + enumeration(Event, desc="Region directory events") { + CPURead, desc="Access from CPU core"; + CPUWrite, desc="Access from CPU core"; + CPUWriteback, desc="Writeback request from CPU core"; + + ReplRegion, desc="Start a replace on a region"; + + PrivateNotify, desc="Update entry to private state"; + SharedNotify, desc="Update entry to shared state"; + WbNotify, desc="Writeback notification received"; + InvRegion, desc="Start invalidating a region"; + DowngradeRegion,desc="Start invalidating a region"; + + InvAck, desc="Ack from core"; + + DoneAck, desc="Ack from core that request has finished"; + AllOutstanding, desc="All outstanding requests have now finished"; + + Evict, desc="Loopback to evict each block"; + LastAck_PrbResp, desc="Done eviciting all the blocks, got the last ack from core, now respond to region dir"; + LastAck_CleanWb, desc="Done eviciting all the blocks, got the last ack from core, now start clean writeback (note the dir has already been updated)"; + + StallAccess, desc="Wait for the done ack on the address before proceeding"; + StallDoneAck, desc="Wait for the access on the address before proceeding"; + + StaleRequest, desc="Got a stale victim from the cache, fwd it without incrementing 
outstanding"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + int size(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region buffer)"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + int NumValidBlocks, desc="Number of trues in ValidBlocks to avoid iterating"; + BoolVec UsedBlocks, desc="A vector to keep track of blocks ever valid"; + bool dirty, desc="Dirty as best known by the region buffer"; + // This is needed so we don't ack an invalidate until all requests are ordered + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + bool MustDowngrade, desc="Set when we got a downgrade before the shd or pvt permissions"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Time region dir started the probe"; + Cycles InitialRequestTime, default="Cycles(0)", desc="Time message was sent to region dir"; + bool MsgSentToDir, desc="True if the current request required a message to the dir"; + bool clearOnDone, default="false", desc="clear valid bit when request completes"; + Addr clearOnDoneAddr, desc="clear valid bit when request completes"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + //int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + bool DoneEvicting, desc="Done iterating through blocks checking for valids"; + 
BoolVec AcksReceived, desc="Received acks for theses blocks\n"; + bool SendAck, desc="If true, send an ack to the r-dir at end of inv"; + ProbeRequestType MsgType, desc="Type of message to send while 'evicting' "; + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + MachineID Requestor, desc="Requestor for three hop transactions"; + bool DemandRequest, default="false", desc="Associated with a demand request"; + Addr DemandAddress, desc="Address for the demand request"; + bool DoneAckReceived, default="false", desc="True if the done ack arrived before the message"; + Addr DoneAckAddr, desc="Address of the done ack received early"; + int OutstandingThreshold, desc="Number of outstanding requests to trigger AllOutstanding on"; + + ProbeRequestType NewMsgType, desc="Type of message to send while 'evicting' "; + MachineID NewRequestor, desc="Requestor for three hop transactions"; + bool NewDemandRequest, default="false", desc="Associated with a demand request"; + Addr NewDemandAddress, desc="Address for the demand request"; + bool dirty, desc="dirty"; + bool AllOutstandingTriggered, default="false", desc="bit for only one all outstanding"; + int OutstandingAcks, default="0", desc="number of acks to wait for"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, 
default="RubySystem::getBlockSizeBytes()"; + int regionBits, default="log2(m_blocksPerRegion)"; + + // Functions + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + return makeNextStrideAddress(a, 1); + } + + MachineID getPeer(MachineID mach, Addr address) { + if (isOnCPU) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } + + bool isOutstanding(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe) && tbe.OutstandingReqs.size() > 0) { + DPRINTF(RubySlicc, " outstanding tbe reqs %s %s %d %d\n", + tbe.OutstandingReqs, addr, getRegionOffset(addr), + tbe.OutstandingReqs.at(getRegionOffset(addr))); + return tbe.OutstandingReqs.at(getRegionOffset(addr)); + } else if (is_valid(cache_entry)) { + DPRINTF(RubySlicc, " outstanding cache reqs %s %s %d %d\n", + cache_entry.OutstandingReqs, addr, getRegionOffset(addr), + cache_entry.OutstandingReqs.at(getRegionOffset(addr))); + return cache_entry.OutstandingReqs.at(getRegionOffset(addr)); + } else { + return false; + } + } + + bool isOnGPU() { + if (isOnCPU) { + return false; + } + return true; + } + + bool isRead(CoherenceRequestType type) { + return (type == CoherenceRequestType:RdBlk || type == CoherenceRequestType:RdBlkS || + type == CoherenceRequestType:VicClean); + } + + bool presentOrAvail(Addr addr) { + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! 
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionBuffer_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionBuffer_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionBuffer_State_to_permission(state)); + } + } + + void recordRequestType(RequestType stat, Addr addr) { + if (stat == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (stat == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:TagArrayRead) { + return 
cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // Overloaded outgoing request network for both probes to cores and requests + // to the directory (both out_ports below share the requestToNetwork buffer). + // Fix Me: These forwarded requests need to be on a separate virtual channel + // to avoid deadlock! + out_port(requestNetwork_out, CPURequestMsg, requestToNetwork); + out_port(probeNetwork_out, NBProbeRequestMsg, requestToNetwork); + + out_port(responseNetwork_out, ResponseMsg, responseToRegDir); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=4) { // triggers this machine enqueued on triggerQueue_out; each one requires a valid TBE + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.Type == TriggerType:AcksComplete) { + if (tbe.SendAck) { // SendAck set: finish by responding to the region dir; otherwise start the clean writeback + trigger(Event:LastAck_PrbResp, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:LastAck_CleanWb, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == TriggerType:AllOutstanding) { + trigger(Event:AllOutstanding, in_msg.addr, cache_entry, tbe); + } else { + assert(in_msg.Type == TriggerType:InvNext); + trigger(Event:Evict, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromDir, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.DoneAck) { // a done ack is consumed only when this block is marked outstanding; otherwise StallDoneAck is raised + if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + trigger(Event:DoneAck, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallDoneAck, 
in_msg.addr, cache_entry, tbe); + } + } else { + assert(is_valid(tbe)); + trigger(Event:InvAck, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromRegionDir, rank=2) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + assert(getRegionBase(in_msg.addr) == in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + trigger(Event:InvRegion, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + trigger(Event:DowngradeRegion, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown probe message\n"); + } + } + } + } + + in_port(notifyNetwork_in, CPURequestMsg, notifyFromRegionDir, rank=1) { + if (notifyNetwork_in.isReady(clockEdge())) { + peek(notifyNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + //Fix Me...add back in: assert(is_valid(cache_entry)); + if (in_msg.Type == CoherenceRequestType:WbNotify) { + trigger(Event:WbNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:SharedNotify) { + trigger(Event:SharedNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:PrivateNotify) { + trigger(Event:PrivateNotify, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown notify message\n"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromCore, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (is_valid(tbe) && tbe.DoneAckReceived && tbe.DoneAckAddr == in_msg.addr) { + DPRINTF(RubySlicc, "Stale/Stall request %s\n", 
in_msg.Type); + if (in_msg.Type == CoherenceRequestType:VicDirty || in_msg.Type == CoherenceRequestType:VicClean ) + { + trigger(Event:StaleRequest, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } + } else if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + DPRINTF(RubySlicc, "Stall outstanding request %s\n", in_msg.Type); + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } else { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:RdBlkM ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else { + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + trigger(Event:CPUWriteback, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPURead, in_msg.addr, cache_entry, tbe); + } + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Replacing region %s for %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + trigger(Event:ReplRegion, victim, victim_entry, victim_tbe); + } + } + } + } + } + + // Actions + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; + out_msg.Type := in_msg.Type; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := in_msg.Requestor; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := 
in_msg.MessageSize; + out_msg.Private := true; + out_msg.InitialRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + if (getState(tbe, cache_entry, address) == State:S) { + out_msg.ForceShared := true; + } + DPRINTF(RubySlicc, "Fwd: %s\n", out_msg); + //assert(getState(tbe, cache_entry, address) == State:P || getState(tbe, cache_entry, address) == State:S); + if (getState(tbe, cache_entry, address) == State:NP_W) { + APPEND_TRANSITION_COMMENT(" fwding stale request: "); + APPEND_TRANSITION_COMMENT(out_msg.Type); + } + } + } + } + + action(u_updateRegionEntry, "u", desc="Update the entry for profiling") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if (in_msg.CtoDSinked == false) { + APPEND_TRANSITION_COMMENT(" incr outstanding "); + cache_entry.NumOutstandingReqs := 1 + cache_entry.NumOutstandingReqs; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)) == false); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := true; + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + } else { + APPEND_TRANSITION_COMMENT(" NOT incr outstanding "); + assert(in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:RdBlkS); + } + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + if (in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:Atomic || + in_msg.Type == CoherenceRequestType:WriteThrough ) + { + cache_entry.dirty := true; + } + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + DPRINTF(RubySlicc, "Got %s for addr %s\n", in_msg.Type, address); + //assert(cache_entry.ValidBlocks.at(getRegionOffset(address))); + // can in fact be inv if core got an inv after a vicclean before it got here + if (cache_entry.ValidBlocks.at(getRegionOffset(address))) { + cache_entry.clearOnDone := true; + cache_entry.clearOnDoneAddr := address; + 
//cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + //cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + } + } else { + if (cache_entry.ValidBlocks.at(getRegionOffset(address)) == false) { // count the block only on its first invalid->valid change + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks + 1; + } + DPRINTF(RubySlicc, "before valid type %s addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); // one %s per argument: request type, block address, valid-block vector + cache_entry.ValidBlocks.at(getRegionOffset(address)) := true; + DPRINTF(RubySlicc, "after valid type %s addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); // same three fields, after the valid bit was set + cache_entry.UsedBlocks.at(getRegionOffset(address)) := true; + } + assert(cache_entry.NumValidBlocks <= blocksPerRegion); + assert(cache_entry.NumValidBlocks >= 0); + APPEND_TRANSITION_COMMENT(" valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + error("This shouldn't happen anymore I think"); + //tbe.ValidBlocks.at(getRegionOffest(address)) := true; + assert(getState(tbe, cache_entry, address) == State:P_NP); + } + } + } + + action(uw_updatePossibleWriteback, "uw", desc="writeback request complete") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(cache_entry) && in_msg.validToInvalid && + cache_entry.clearOnDone && cache_entry.clearOnDoneAddr == address) { // the unblock reports valid->invalid for the address recorded in clearOnDoneAddr: clear its valid bit now + DPRINTF(RubySlicc, "I have no idea what is going on here\n"); + cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + cache_entry.clearOnDone := false; + } + } + } + + + action(rp_requestPrivate, "rp", desc="Send private request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.DemandAddress := address; + out_msg.Type := CoherenceRequestType:PrivateRequest; + out_msg.OriginalType := in_msg.Type; + 
out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + DPRINTF(RubySlicc, "Private request %s\n", out_msg); + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ru_requestUpgrade, "ru", desc="Send upgrade request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // The upgrade is only issued when no TBE exists for this region + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:UpgradeRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rw_requestWriteback, "rq", desc="Send writeback request") { // NOTE(review): shorthand is "rq" although the action name uses the rw_ prefix -- confirm this is intended + // Sends a CleanWbRequest to the region directory; reads tbe.dirty, so a TBE is expected here + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); // sent with the region base address, not the block address + out_msg.Type := CoherenceRequestType:CleanWbRequest; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Dirty := tbe.dirty; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rs_requestShared, "rs", desc="Send shared request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:SharedRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ai_ackRegionInv, "ai", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ad_ackDircetory, "ad", desc="send probe response to directory") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { //VIPER tcc doesnt understand PrbShrData + assert(tbe.DemandRequest); //So, let RegionBuffer take care of sending back ack + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(aie_ackRegionExclusiveInv, "aie", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := tbe.dirty; + } + } + + action(ain_ackRegionInvNow, "ain", desc="Send ack to r-dir on region inv") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aine_ackRegionInvExlusiveNow, "aine", desc="Send ack to r-dir on region inv with exlusive permission") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ap_ackPrivateNotify, "ap", desc="Send ack to r-dir on private notify") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:PrivateAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aw_ackWbNotify, "aw", desc="Send ack to r-dir on writeback notify") { + peek(notifyNetwork_in, CPURequestMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:RegionWbAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + } + } + + action(e_evictCurrent, "e", desc="Evict this block in the region") { + // send force invalidate message to directory to invalidate this block + // must invalidate all blocks since region buffer could have privitized it + if (tbe.ValidBlocks.at(getRegionOffset(address)) && + (tbe.DemandRequest == false || tbe.DemandAddress != address)) { + DPRINTF(RubySlicc, "trying to evict address %s (base: %s, offset: %d)\n", address, getRegionBase(address), getRegionOffset(address)); + DPRINTF(RubySlicc, "tbe valid blocks %s\n", tbe.ValidBlocks); + + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + if (address == tbe.DemandAddress) { + out_msg.DemandRequest := true; + } + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + APPEND_TRANSITION_COMMENT(" current "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks.at(getRegionOffset(address))); + tbe.AllAcksReceived := false; + } else { + DPRINTF(RubySlicc, "Not evicting demand %s\n", address); + } + } + + action(ed_evictDemand, "ed", desc="Evict the demand request if it's valid") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; + tbe.DoneEvicting := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } else if (tbe.DemandRequest) { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + out_msg.DemandRequest := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + 
tbe.AllAcksReceived := false; + } + if (tbe.ValidBlocks.at(getRegionOffset(tbe.DemandAddress)) == false) { + tbe.OutstandingAcks := tbe.OutstandingAcks + 1; + } + APPEND_TRANSITION_COMMENT("Evicting demand "); + APPEND_TRANSITION_COMMENT(tbe.DemandAddress); + } + APPEND_TRANSITION_COMMENT("waiting acks "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(adp_AckDemandProbe, "fp", desc="forward demand probe even if we know that the core is invalid") { + peek(probeNetwork_in, NBProbeRequestMsg) { + if (in_msg.DemandRequest) { + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := in_msg.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(en_enqueueNextEvict, "en", desc="Queue evict the next block in the region") { + // increment in_msg.addr by blockSize bytes and enqueue on triggerPort + // Only enqueue if the next address doesn't overrun the region bound + if (getRegionBase(getNextBlock(address)) == getRegionBase(address)) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getNextBlock(address); + } + } else { + tbe.DoneEvicting := true; + DPRINTF(RubySlicc, "Done evicing region %s\n", getRegionBase(address)); + DPRINTF(RubySlicc, "Waiting for %s acks\n", tbe.OutstandingAcks); + if (tbe.AllAcksReceived == true) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := 
getRegionBase(address); + } + } + } + } + + action(ef_enqueueFirstEvict, "ef", desc="Queue the first block in the region to be evicted") { + if (tbe.DoneEvicting == false) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getRegionBase(address); + } + } + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + DPRINTF(RubySlicc, "received ack for %s reg: %s vec: %s pos: %d\n", + address, getRegionBase(address), tbe.ValidBlocks, getRegionOffset(address)); + peek(unblockNetwork_in, UnblockMsg) { + // + // Note the tbe ValidBlock vec will be a conservative list of the + // valid blocks since the cache entry ValidBlock vec is set on the + // request + // + if (in_msg.wasValid) { + assert(tbe.ValidBlocks.at(getRegionOffset(address))); + } + } + tbe.OutstandingAcks := tbe.OutstandingAcks - 1; + tbe.AcksReceived.at(getRegionOffset(address)) := true; + assert(tbe.OutstandingAcks >= 0); + if (tbe.OutstandingAcks == 0) { + tbe.AllAcksReceived := true; + if (tbe.DoneEvicting) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } + } + + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(do_decrementOutstanding, "do", desc="Decrement outstanding requests") { + APPEND_TRANSITION_COMMENT(" decr outstanding "); + if (is_valid(cache_entry)) { + cache_entry.NumOutstandingReqs := cache_entry.NumOutstandingReqs - 1; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address))); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(cache_entry.NumOutstandingReqs >= 0); + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + } + if (is_valid(tbe)) { + 
tbe.NumOutstandingReqs := tbe.NumOutstandingReqs - 1; + assert(tbe.OutstandingReqs.at(getRegionOffset(address))); + tbe.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(tbe.NumOutstandingReqs >= 0); + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + } + } + + action(co_checkOutstanding, "co", desc="check if there are no more outstanding requests") { + assert(is_valid(tbe)); + if ((tbe.NumOutstandingReqs <= tbe.OutstandingThreshold) && + (tbe.AllOutstandingTriggered == false)) { + APPEND_TRANSITION_COMMENT(" no more outstanding: "); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + APPEND_TRANSITION_COMMENT(tbe.OutstandingThreshold); + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AllOutstanding; + if (tbe.DemandRequest) { + out_msg.addr := tbe.DemandAddress; + } else { + out_msg.addr := getRegionBase(address); + } + DPRINTF(RubySlicc, "co enqueuing %s\n", out_msg); + tbe.AllOutstandingTriggered := true; + } + } else { + APPEND_TRANSITION_COMMENT(" still more outstanding "); + } + } + + action(ro_resetAllOutstanding, "ro", desc="Reset all outstanding") { + tbe.AllOutstandingTriggered := false; + } + + action(so_setOutstandingCheckOne, "so", desc="Check outstanding is waiting for 1, not 0") { + // Need this for S_P because one request is outstanding between here and r-dir + tbe.OutstandingThreshold := 1; + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + cache_entry.ValidBlocks.clear(); + cache_entry.ValidBlocks.resize(blocksPerRegion); + cache_entry.UsedBlocks.clear(); + cache_entry.UsedBlocks.resize(blocksPerRegion); + cache_entry.dirty := false; + cache_entry.NumOutstandingReqs := 0; + cache_entry.OutstandingReqs.clear(); + cache_entry.OutstandingReqs.resize(blocksPerRegion); + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region 
entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + tbe.OutstandingThreshold := 0; + if (is_valid(cache_entry)) { + tbe.NumOutstandingReqs := cache_entry.NumOutstandingReqs; + tbe.OutstandingReqs := cache_entry.OutstandingReqs; + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + tbe.dirty := cache_entry.dirty; + tbe.ValidBlocks := cache_entry.ValidBlocks; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" tbe valid blocks "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" cache valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + tbe.dirty := false; + } + } + + action(m_markSendAck, "m", desc="Mark TBE that we need to ack at end") { + assert(is_valid(tbe)); + tbe.SendAck := true; + } + + action(db_markDirtyBit, "db", desc="Mark TBE dirty bit") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(tbe)) { + tbe.dirty := tbe.dirty || in_msg.Dirty; + } + } + } + + action(dr_markDoneAckReceived, "dr", desc="Mark TBE that a done ack has been received") { + assert(is_valid(tbe)); + tbe.DoneAckReceived := true; + tbe.DoneAckAddr := address; + APPEND_TRANSITION_COMMENT(" marking done ack on TBE "); + } + + action(se_setTBE, "se", desc="Set msg type to evict") { + peek(probeNetwork_in, NBProbeRequestMsg) { + tbe.MsgType := in_msg.Type; + tbe.Requestor := in_msg.Requestor; + tbe.DemandAddress := in_msg.DemandAddress; + tbe.DemandRequest := in_msg.DemandRequest; + } + } + + action(sne_setNewTBE, "sne", desc="Set msg type to evict") { + 
peek(probeNetwork_in, NBProbeRequestMsg) { + tbe.NewMsgType := in_msg.Type; + tbe.NewRequestor := in_msg.Requestor; + tbe.NewDemandAddress := in_msg.DemandAddress; + tbe.NewDemandRequest := in_msg.DemandRequest; + } + } + + action(soe_setOldTBE, "soe", desc="Set msg type to evict") { + tbe.MsgType := tbe.NewMsgType; + tbe.Requestor := tbe.NewRequestor; + tbe.DemandAddress := tbe.NewDemandAddress; + tbe.DemandRequest := tbe.NewDemandRequest; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + } + + action(ser_setTBE, "ser", desc="Set msg type to evict repl") { + tbe.MsgType := ProbeRequestType:PrbInv; + } + + action(md_setMustDowngrade, "md", desc="When permissions finally get here, must be shared") { + assert(is_valid(cache_entry)); + cache_entry.MustDowngrade := true; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + unset_tbe(); + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pl_popUnblockQueue, "pl", desc="Pop the unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNotifyQueue, "pn", desc="Pop the notify queue") { + notifyNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop the probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + DPRINTF(RubySlicc, "Trigger Before Contents: %s\n", triggerQueue_in); + triggerQueue_in.dequeue(clockEdge()); + DPRINTF(RubySlicc, "Trigger After Contents: %s\n", triggerQueue_in); + } + + // Must always use wake all, since non-region address wait on region addresses + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this 
region") { + wakeUpAllBuffers(); + } + + action(zz_stallAndWaitRequestQueue, "\z", desc="recycle request queue") { + Addr regAddr := getRegionBase(address); + DPRINTF(RubySlicc, "Stalling address %s\n", regAddr); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(yy_stallAndWaitProbeQueue, "\y", desc="stall probe queue") { + Addr regAddr := getRegionBase(address); + stall_and_wait(probeNetwork_in, regAddr); + } + + action(yyy_recycleProbeQueue, "\yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zzz_recycleRequestQueue, "\zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(www_recycleUnblockNetwork, "\ww", desc="recycle unblock queue") { + unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address, cache_entry.NumValidBlocks); + } + + // Transitions + + transition({NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, P_NP_W, P_NP_NP, NP_W}, {CPURead, CPUWriteback, CPUWrite}) {} { + zz_stallAndWaitRequestQueue; + } + + transition(SS_P, {CPURead, CPUWriteback}) { + zz_stallAndWaitRequestQueue; + } + + transition({NP, S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, NP_W, P_NP_NP}, StallAccess) {} { + zz_stallAndWaitRequestQueue; + } + + transition({S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, P_NP_W, P_NP_NP, NP_W}, StallDoneAck) { + www_recycleUnblockNetwork; + } + + transition(NP, StallDoneAck, NP_W) { + t_allocateTBE; + db_markDirtyBit; + dr_markDoneAckReceived; + pl_popUnblockQueue; + } + + transition(NP_W, StaleRequest, NP) { + f_fwdReqToDir; + dt_deallocateTBE; + wa_wakeUpAllDependents; + p_popRequestQueue; + } + + transition(P_NP_O, DowngradeRegion) {} { + z_stall; // should 
stall and wait + } + + transition({NP_PS, S_NP_PS, S_P, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P}, ReplRegion) {} { + zz_stallAndWaitRequestQueue; // can't let things get out of order! + } + + transition({P_NP_O, S_O, SS_P}, InvRegion) {} { + yyy_recycleProbeQueue; // can't be z_stall because there could be a RdBlkM in the requestQueue which has the sinked flag which is blocking the inv + } + + transition(P_NP, {InvRegion, DowngradeRegion}, P_NP_NP) {} { + sne_setNewTBE; + pp_popProbeQueue; + } + + transition(S_P, DowngradeRegion) {} { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, InvRegion) { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, DowngradeRegion) { + adp_AckDemandProbe; + aine_ackRegionInvExlusiveNow; + pp_popProbeQueue; + } + + transition({P, S}, {CPURead, CPUWriteback}) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(P, CPUWrite) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(S, CPUWrite, S_O) {TagArrayRead} { + mru_setMRU; + t_allocateTBE; + co_checkOutstanding; + zz_stallAndWaitRequestQueue; + } + + transition(S_O, AllOutstanding, SS_P) { + wa_wakeUpAllDependents; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(SS_P, CPUWrite, S_P) { + mru_setMRU; + dt_deallocateTBE; + ru_requestUpgrade; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(NP, {CPURead, CPUWriteback}, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rs_requestShared; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP, CPUWrite, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rp_requestPrivate; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP_PS, PrivateNotify, P) {} { + 
ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(S_P, PrivateNotify, P) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(NP_PS, SharedNotify, S) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(P_NP_W, WbNotify, NP) {} { + aw_ackWbNotify; + wa_wakeUpAllDependents; + dt_deallocateTBE; + pn_popNotifyQueue; + } + + transition({P, S}, ReplRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ser_setTBE; + d_deallocateRegionEntry; + co_checkOutstanding; + } + + transition({P, S}, InvRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + d_deallocateRegionEntry; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_NP_O, AllOutstanding, P_NP) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(S_P, InvRegion, S_NP_PS_O) {TagArrayRead} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + so_setOutstandingCheckOne; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(S_NP_PS_O, AllOutstanding, S_NP_PS) { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(P, DowngradeRegion, P_S_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_S_O, AllOutstanding, P_S) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition({P, S}, DoneAck) {TagArrayWrite} { + do_decrementOutstanding; + wa_wakeUpAllDependents; + db_markDirtyBit; + uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({S_P, NP_PS, S_NP_PS}, DoneAck) {TagArrayWrite} { + www_recycleUnblockNetwork; + } + + transition({P_NP_O, S_NP_PS_O, P_S_O, S_O}, DoneAck) {} { + do_decrementOutstanding; + co_checkOutstanding; + db_markDirtyBit; + 
uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, Evict) {} { + e_evictCurrent; + en_enqueueNextEvict; + pt_popTriggerQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, InvAck) {} { + ra_receiveAck; + db_markDirtyBit; + pl_popUnblockQueue; + } + + transition(P_NP, LastAck_CleanWb, P_NP_W) {} { + rw_requestWriteback; + pt_popTriggerQueue; + } + + transition(P_NP_NP, LastAck_CleanWb, P_NP) {} { + soe_setOldTBE; + m_markSendAck; + ed_evictDemand; + ef_enqueueFirstEvict; + pt_popTriggerQueue; + } + + transition(P_NP, LastAck_PrbResp, NP) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S_NP_PS, LastAck_PrbResp, NP_PS) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(P_S, LastAck_PrbResp, S) {} { + ai_ackRegionInv; + ad_ackDircetory; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + +} + diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm new file mode 100644 index 000000000..b392311c5 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. 
Must set block size same as below + NodeID cpuRegionBufferNum; + NodeID gpuRegionBufferNum; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 10; // Latency to fwd requests and send invs to directory + bool always_migrate := "False"; + bool sym_migrate := "False"; + bool asym_migrate := "False"; + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // To the directory + MessageBuffer * requestToDir, network="To", virtual_network="5", vnet_type="request"; + + // To the region buffers + MessageBuffer * notifyToRBuffer, network="To", virtual_network="7", vnet_type="request"; + MessageBuffer * probeToRBuffer, network="To", virtual_network="8", vnet_type="request"; + + // From the region buffers + MessageBuffer * responseFromRBuffer, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * requestFromRegBuf, network="From", virtual_network="0", vnet_type="request"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionDir_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to owner"; + S, AccessPermission:Invalid, desc="Region is shared between CPU and GPU"; + + P_NP, AccessPermission:Invalid, desc="Evicting the region"; + NP_P, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + NP_S, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + P_P, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + S_S, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + P_S, AccessPermission:Invalid, desc="Downgrading the region"; + S_P, AccessPermission:Invalid, desc="Upgrading the region"; + P_AS, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + S_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + P_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + + SP_NP_W, 
AccessPermission:Invalid, desc="Last sharer writing back, waiting for ack"; + S_W, AccessPermission:Invalid, desc="Sharer writing back, waiting for ack"; + + P_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + P_AS_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + S_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + } + + enumeration(Event, desc="Region directory events") { + SendInv, desc="Send inv message to any machine that has a region buffer"; + SendUpgrade, desc="Send upgrade message to any machine that has a region buffer"; + SendDowngrade, desc="Send downgrade message to any machine that has a region buffer"; + + Evict, desc="Evict this region"; + + UpgradeRequest, desc="Request from r-buf for an upgrade"; + SharedRequest, desc="Request from r-buf for read"; + PrivateRequest, desc="Request from r-buf for write"; + + InvAckCore, desc="Ack from region buffer to order the invalidate"; + InvAckCoreNoShare, desc="Ack from region buffer to order the invalidate, and it does not have the region"; + CPUPrivateAck, desc="Ack from region buffer to order private notification"; + + LastAck, desc="Done eviciting all the blocks"; + + StaleCleanWbRequest, desc="stale clean writeback reqeust"; + StaleCleanWbRequestNoShare, desc="stale clean wb req from a cache which should be removed from sharers"; + CleanWbRequest, desc="clean writeback reqeust, multiple sharers"; + CleanWbRequest_LastSharer, desc="clean writeback reqeust, last sharer"; + WritebackAck, desc="Writeback Ack from region buffer"; + DirReadyAck, desc="Directory is ready, waiting Ack from region buffer"; + + TriggerInv, desc="trigger invalidate message"; + TriggerDowngrade, desc="trigger downgrade message"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data 
array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + NetDest Sharers, desc="Set of machines that are sharing, but not owners"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region dir)"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + Cycles ProbeStart, desc="Time when the first probe request was issued"; + bool LastWriten, default="false", desc="The last time someone accessed this region, it wrote it"; + bool LastWritenByCpu, default="false", desc="The last time the CPU accessed this region, it wrote it"; + bool LastWritenByGpu, default="false", desc="The last time the GPU accessed this region, it wrote it"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + NetDest Sharers, desc="Set of machines to send evicts"; + int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + CoherenceRequestType MsgType, desc="Msg type for the evicts could be inv or dwngrd"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Start of probe request"; + Cycles InitialRequestTime, default="Cycles(0)", desc="To forward back on out msg"; + Addr DemandAddress, desc="Demand address from original request"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick 
clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int regionBits, default="log2(m_blocksPerRegion)"; + + // Functions + + MachineID getCoreMachine(MachineID rBuf, Addr address) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } else { + error("Unexpected region buffer number"); + } + } + + bool isCpuMachine(MachineID rBuf) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return true; + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + return false; + } else { + error("Unexpected region buffer number"); + } + } + + bool symMigrate(Entry cache_entry) { + return cache_entry.LastWriten; + } + + bool asymMigrate(Entry cache_entry, MachineID requestor) { + if (isCpuMachine(requestor)) { + return cache_entry.LastWritenByCpu; + } else { + return cache_entry.LastWritenByGpu; + } + } + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + makeNextStrideAddress(a, 1); + return a; + } + + bool presentOrAvail(Addr addr) { + DPRINTF(RubySlicc, "Present? %s, avail? 
%s\n", cacheMemory.isTagPresent(getRegionBase(addr)), cacheMemory.cacheAvail(getRegionBase(addr))); + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionDir_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionDir_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionDir_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + 
cacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + out_port(requestNetwork_out, CPURequestMsg, requestToDir); + out_port(notifyNetwork_out, CPURequestMsg, notifyToRBuffer); + out_port(probeNetwork_out, NBProbeRequestMsg, probeToRBuffer); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=2) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + assert(is_valid(tbe)); + trigger(Event:LastAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:InvRegion) { + assert(is_valid(tbe)); + trigger(Event:TriggerInv, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:DowngradeRegion) { + 
assert(is_valid(tbe)); + trigger(Event:TriggerDowngrade, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown trigger message"); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromRBuffer, rank=1) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.NotCached) { + trigger(Event:InvAckCoreNoShare, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:InvAckCore, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:PrivateAck) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(cache_entry)); + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender)); + trigger(Event:CPUPrivateAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:RegionWbAck) { + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender) == false); + assert(in_msg.addr == getRegionBase(in_msg.addr)); + trigger(Event:WritebackAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DirReadyAck) { + assert(is_valid(tbe)); + trigger(Event:DirReadyAck, getRegionBase(in_msg.addr), cache_entry, tbe); + } else { + error("Invalid response type"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromRegBuf, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + //assert(in_msg.addr == getRegionBase(in_msg.addr)); + Addr address := getRegionBase(in_msg.addr); + DPRINTF(RubySlicc, "Got %s, base %s\n", in_msg.addr, address); + if (presentOrAvail(address)) { + 
TBE tbe := getTBE(address); + Entry cache_entry := getCacheEntry(address); + if (in_msg.Type == CoherenceRequestType:PrivateRequest) { + if (is_valid(cache_entry) && (cache_entry.Owner != in_msg.Requestor || + getState(tbe, cache_entry, address) == State:S)) { + trigger(Event:SendInv, address, cache_entry, tbe); + } else { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:SharedRequest) { + if (is_invalid(cache_entry)) { + // If no one has ever requested this region give private permissions + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else { + if (always_migrate || + (sym_migrate && symMigrate(cache_entry)) || + (asym_migrate && asymMigrate(cache_entry, in_msg.Requestor))) { + if (cache_entry.Sharers.count() == 1 && + cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendInv, address, cache_entry, tbe); + } + } else { // don't migrate + if(cache_entry.Sharers.isElement(in_msg.Requestor) || + getState(tbe, cache_entry, address) == State:S) { + trigger(Event:SharedRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendDowngrade, address, cache_entry, tbe); + } + } + } + } else if (in_msg.Type == CoherenceRequestType:UpgradeRequest) { + if (is_invalid(cache_entry)) { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else if (cache_entry.Sharers.count() == 1 && cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendUpgrade, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:CleanWbRequest) { + if (is_invalid(cache_entry) || cache_entry.Sharers.isElement(in_msg.Requestor) == false) { + trigger(Event:StaleCleanWbRequest, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "wb address %s(%s) owner %s sharers %s requestor %s %d %d\n", in_msg.addr, 
getRegionBase(in_msg.addr), cache_entry.Owner, cache_entry.Sharers, in_msg.Requestor, cache_entry.Sharers.isElement(in_msg.Requestor), cache_entry.Sharers.count()); + if (cache_entry.Sharers.isElement(in_msg.Requestor) && cache_entry.Sharers.count() == 1) { + DPRINTF(RubySlicc, "last wb\n"); + trigger(Event:CleanWbRequest_LastSharer, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "clean wb\n"); + trigger(Event:CleanWbRequest, address, cache_entry, tbe); + } + } + } else { + error("unknown region dir request type"); + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Evicting address %s for new region at address %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + assert(is_valid(victim_entry)); + trigger(Event:Evict, victim, victim_entry, victim_tbe); + } + } + } + } + + // Actions + + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. 
"address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirShared, "fs", desc="Forward CPU request to directory (shared)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. 
"address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAck, "fa", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! 
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAckShared, "fas", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! 
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + peek(requestNetwork_in, CPURequestMsg) { + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + } + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + //assert(tbe.ValidBlocks.at(getRegionOffset(address))); + DPRINTF(RubySlicc, "received ack for %s reg: %s\n", address, getRegionBase(address)); + tbe.NumValidBlocks := tbe.NumValidBlocks - 1; + assert(tbe.NumValidBlocks >= 0); + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + } + + action(ca_checkAcks, "ca", desc="Check to see if we need more acks") { + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + } + + action(ti_triggerInv, "ti", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:InvRegion; + out_msg.addr := address; + } + } + + action(td_triggerDowngrade, "td", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:DowngradeRegion; + out_msg.addr := address; + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + if (is_valid(cache_entry)) { + tbe.Owner := 
cache_entry.Owner; + tbe.Sharers := cache_entry.Sharers; + tbe.AllAcksReceived := true; // assume no acks are required + } + tbe.ProbeRequestTime := curCycle(); + peek(requestNetwork_in, CPURequestMsg) { + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.DemandAddress := in_msg.addr; + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + APPEND_TRANSITION_COMMENT(" Owner, "); + APPEND_TRANSITION_COMMENT(tbe.Owner); + APPEND_TRANSITION_COMMENT(" sharers, "); + APPEND_TRANSITION_COMMENT(tbe.Sharers); + } + + action(ss_setSharers, "ss", desc="Add requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.add(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rs_removeSharer, "rs", desc="Remove requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" removing "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" sharers "); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rsr_removeSharerResponse, "rsr", desc="Remove requestor to sharers") { + peek(responseNetwork_in, ResponseMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(cs_clearSharers, "cs", desc="Add requestor to sharers") { + cache_entry.Sharers.clear(); + } + + action(so_setOwner, "so", desc="Set the owner to the requestor") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Owner := in_msg.Requestor; + APPEND_TRANSITION_COMMENT(" Owner now: "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(rr_removeRequestorFromTBE, "rr", desc="Remove requestor from TBE sharers") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.Sharers.remove(in_msg.Requestor); + } + } + + action(ur_updateDirtyStatusOnRequest, "ur", 
desc="Update dirty status on demand request") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if ((in_msg.Type == CoherenceRequestType:SharedRequest) && + (cache_entry.Sharers.isElement(in_msg.Requestor) == false)) { + cache_entry.LastWriten := false; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := false; + } else { + cache_entry.LastWritenByGpu := false; + } + } else if ((in_msg.Type == CoherenceRequestType:PrivateRequest) || + (in_msg.Type == CoherenceRequestType:UpgradeRequest)) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + } + + action(ud_updateDirtyStatusWithWb, "ud", desc="Update dirty status on writeback") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry) && in_msg.Dirty) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + + action(sns_setNumAcksSharers, "sns", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := tbe.Sharers.count(); + } + + action(sno_setNumAcksOne, "sno", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := 1; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" reg: "); + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + unset_tbe(); + } + + action(wb_sendWbNotice, "wb", desc="Send notice to cache that writeback is acknowledged") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + 
out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(wbn_sendWbNoticeNoAck, "wbn", desc="Send notice to cache that writeback is acknowledged (no ack needed)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.NoAckNeeded := true; + } + } + } + + action(b_sendPrivateNotice, "b", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(bs_sendSharedNotice, "bs", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:SharedNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(c_sendSharedNoticeToOrigReq, "c", desc="Send notice to private cache that it has shared access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := 
CoherenceRequestType:SharedNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(sp_sendPrivateNoticeToOrigReq, "sp", desc="Send notice to private cache that it has private access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i_RegionInvNotify, "i", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + //out_msg.Requestor := tbe.Requestor; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + //Fix me: assert(tbe.Sharers.count() > 0); + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i0_RegionInvNotifyDemand0, "i0", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + // Demand address should default to 0 -> out_msg.DemandAddress := 0; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.Destination := 
tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(rd_RegionDowngrade, "rd", desc="Send notice to private cache that it only has shared access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop the response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(s_stallAndWaitRequest, "s", desc="Stall and wait on the region address") { + Addr regAddr := getRegionBase(address); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(w_wakeUpRegionDependents, "w", desc="Wake up any requests waiting for this region") { + wakeUpBuffers(getRegionBase(address)); + } + + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(zz_recycleRequestQueue, "\z", desc="...") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address); + } + + // Transitions + + transition({NP_P, P_P, NP_S, S_S, S_P, P_S, P_NP, S_AP, P_AS, P_AP, SP_NP_W, S_W, P_AP_W, P_AS_W, S_AP_W}, {PrivateRequest, SharedRequest, UpgradeRequest, SendInv, SendUpgrade, 
SendDowngrade, CleanWbRequest, CleanWbRequest_LastSharer, StaleCleanWbRequest}) { + s_stallAndWaitRequest + } + + transition({NP_P, P_P, NP_S, S_S, S_P, S_W, P_S, P_NP, S_AP, P_AS, P_AP, P_AP_W, P_AS_W, S_AP_W}, Evict) { + zz_recycleRequestQueue; + } + + transition(NP, {PrivateRequest, SendUpgrade}, NP_P) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition(P, {PrivateRequest, UpgradeRequest}, P_P) {TagArrayRead} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + t_allocateTBE; + p_popRequestQueue; + } + + transition({NP_P, P_P}, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP, P, S}, StaleCleanWbRequest) {TagArrayRead, TagArrayWrite} { + wbn_sendWbNoticeNoAck; + ud_updateDirtyStatusWithWb; + p_popRequestQueue; + } + + transition(NP, SharedRequest, NP_S) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + // Could probably do this in parallel with other shared requests + transition(S, SharedRequest, S_S) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition({P, S}, CleanWbRequest_LastSharer, SP_NP_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + d_deallocateRegionEntry; + p_popRequestQueue; + } + + transition(S, CleanWbRequest, S_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + p_popRequestQueue; + } + + transition(SP_NP_W, WritebackAck, NP) { + 
dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S_W, WritebackAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP_S, S_S}, CPUPrivateAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S, UpgradeRequest, S_P) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + t_allocateTBE; + p_popRequestQueue; + } + + transition(S_P, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(P, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + //i_RegionInvNotify; + p_popRequestQueue; + } + + transition({P_AP_W, S_AP_W}, DirReadyAck) { + ti_triggerInv; + pr_popResponseQueue; + } + + transition(P_AS_W, DirReadyAck) { + td_triggerDowngrade; + pr_popResponseQueue; + } + + transition(P_AS_W, TriggerDowngrade, P_AS) { + rd_RegionDowngrade; + pt_popTriggerQueue; + } + + transition(P_AP_W, TriggerInv, P_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(S_AP_W, TriggerInv, S_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(P, SendUpgrade, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(P, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(S, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + 
so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(P_NP, LastAck, NP) { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S, SendUpgrade, S_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S_AP, LastAck, S_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_AP, LastAck, P_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P, SendDowngrade, P_AS_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAckShared; + so_setOwner; + t_allocateTBE; + sns_setNumAcksSharers; + ss_setSharers; //why do we set the sharers before sending the downgrade? Are we sending a downgrade to the requestor? 
+ p_popRequestQueue; + } + + transition(P_AS, LastAck, P_S) { + c_sendSharedNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_S, CPUPrivateAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({P_NP, P_AS, S_AP, P_AP}, InvAckCore) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition({P_NP, S_AP, P_AP}, InvAckCoreNoShare) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition(P_AS, InvAckCoreNoShare) {} { + ra_receiveAck; + rsr_removeSharerResponse; + pr_popResponseQueue; + } + +} + + diff --git a/src/mem/protocol/MOESI_AMD_Base-dir.sm b/src/mem/protocol/MOESI_AMD_Base-dir.sm new file mode 100644 index 000000000..52cefda66 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-dir.sm @@ -0,0 +1,1137 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CPUonly := "False"; + int TCC_select_num_bits; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. 
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="Read request answered with a Shared response"; + RdBlkM, desc="Read request answered with a Modified response"; + RdBlk, desc="Read request answered with an Exclusive or Shared response"; + CtoD, desc="Request answered with a CtoD ack; cores are probe-invalidated, no data returned"; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superseded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, desc="Read the data array";
+ L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the tag array"; + L3TagArrayWrite, desc="Write the tag array"; + } + + // TYPES + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr),
return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, 
triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, 
in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. + } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, 
in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := 
CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + 
out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + if (tbe.Dirty == false) { + tbe.DataBlk := entry.DataBlk; + } + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := 
ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if (((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) || + CPUonly) { + } else if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if (noTCCdir || CPUonly) { + // No TCCdir to probe; when noTCCdir is set the TCC is added below unless CPUonly + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + // NumPendingAcks is not bumped here: it is set from Destination.count() below + } + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to the list.
This replaces all TCPs and SQCs + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + if (!noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, + TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + tbe.Dirty 
:= true; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask); + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // do nothing + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + tbe.writeMask.fillMask(); + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" 
setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + 
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), 
cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead}{ + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, CtoD, BP) {L3TagArrayRead} { + t_allocateTBE; + ic_probeInvCore; + p_popRequestQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + 
zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + 
pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { // same resource list as the BS_Pm/BM_Pm/B_Pm completions; the tag array was previously listed twice + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base-msg.sm 
b/src/mem/protocol/MOESI_AMD_Base-msg.sm new file mode 100644 index 000000000..ff8842369 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-msg.sm @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + + +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + CtoD, desc="Change To Dirty"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + Atomic, desc="Upper level atomic"; + AtomicWriteBack, desc="Upper level atomic"; + WriteThrough, desc="Ordered WriteThrough w/Data"; + WriteThroughFifo, desc="WriteThrough with no data"; + WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; + WriteFlush, desc="Release Flush"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? + + WBApproval, desc="WB Approval"; + + // Messages between Dir and R-Dir + ForceInv, desc="Send invalide to the block"; + ForceDowngrade, desc="Send downgrade to the block"; + Unblock, desc="Used to let the dir know a message has been sunk"; + + // Messages between R-Dir and R-Buffer + PrivateNotify, desc="Let region buffer know it has private access"; + SharedNotify, desc="Let region buffer know it has shared access"; + WbNotify, desc="Let region buffer know it saw its wb request"; + Downgrade, desc="Force the region buffer to downgrade to shared"; + // Response to R-Dir (probably should be on a different network, but + // I need it to be ordered with respect to requests) + InvAck, desc="Let the R-Dir know when the inv has occured"; + + PrivateRequest, desc="R-buf wants the region in private"; + UpgradeRequest, desc="R-buf wants the region in private"; + SharedRequest, desc="R-buf wants the region in shared (could respond with private)"; + CleanWbRequest, desc="R-buf wants to deallocate clean region"; + + NA, desc="So we don't get segfaults"; +} + +enumeration(ProbeRequestType, desc="Probe Request Types") { + PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbInv, desc="Probe to Invalidate"; + + // For regions + PrbRepl, desc="Force the cache to do a 
replacement"; + PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbAtomic, desc="Forwarded Atomic Operation"; +} + + +enumeration(CoherenceResponseType, desc="Coherence Response Types") { + NBSysResp, desc="Northbridge response to CPU Rd request"; + NBSysWBAck, desc="Northbridge response ok to WB"; + TDSysResp, desc="TCCdirectory response to CPU Rd request"; + TDSysWBAck, desc="TCCdirectory response ok to WB"; + TDSysWBNack, desc="TCCdirectory response ok to drop"; + CPUPrbResp, desc="CPU Probe Response"; + CPUData, desc="CPU Data"; + StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; + CPUCancelWB, desc="want to cancel WB to Memory"; + MemData, desc="Data from Memory"; + + // for regions + PrivateAck, desc="Ack that r-buf received private notify"; + RegionWbAck, desc="Writeback Ack that r-buf completed deallocation"; + DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake"; +} + +enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") { + Modified, desc="Modified"; + Owned, desc="Owned state"; + Exclusive, desc="Exclusive"; + Shared, desc="Shared"; + NA, desc="NA"; +} + +structure(CPURequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + Addr DemandAddress, desc="Physical block address for this request"; + CoherenceRequestType Type, desc="Type of request"; + DataBlock DataBlk, desc="data for the cache line"; // only for WB + bool Dirty, desc="whether WB data is dirty"; // only for WB + MachineID Requestor, desc="Node who initiated the request"; + NetDest Destination, desc="Multicast destination mask"; + bool Shared, desc="For CPU_WrVicBlk, vic is O not M. 
For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + WriteMask writeMask, desc="Write Through Data"; + MachineID WTRequestor, desc="Node who initiated the write through"; + HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope"; + int wfid, default="0", desc="wavefront id"; + bool NoWriteConflict, default="true", desc="write collided with CAB entry"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + // Only VicDirty requests are read from; every other request type returns false + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this 
request"; + ProbeRequestType Type, desc="NB_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so a functional write + // has nothing to update here and always returns false + return false; + } + +} + +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + int Phase, desc="Synchronization Phase"; + int wfid, desc="wavefront id for Release"; + MachineID Requestor, desc="Node who initiated the request"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so a functional write + // has nothing to update here and always returns false + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent 
the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + // Normally if a block gets hit by a probe while waiting to be written back, + // you flip the NbReqShared signal (part of the CPURequest signal group). + // But since this is in packets and I don't want to send a separate packet, + // let's just send this signal back with the data instead + bool NbReqShared, desc="modification of Shared field from initial request, e.g. 
hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + MachineID WTRequestor, desc="Node who started the writethrough"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + int wfid, default="0", desc="wavefront id"; + int Phase, desc="Synchronization Phase"; + + int ProgramCounter, desc="PC that issues this request"; + bool mispred, desc="tell TCP if the block should not be bypassed"; + + + bool functionalRead(Packet *pkt) { + // Only CPUData and MemData responses are read from; every other response type returns false + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(UnblockMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, 
default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so a functional write + // has nothing to update here and always returns false + return false; + } +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; + //For writethrough + UnblockWriteThrough, desc="unblock"; + WriteData, desc="Write to full cacheblock data"; + WriteDone, desc="Sequencer says that write is done"; + AtomicDone, desc="Atomic is done"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", desc="Cache to invalidate"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so a functional write + // has nothing to update here and always returns false + return false; + } + +} + +enumeration(FifoType, desc="Fifo Type") { + WriteDummy, desc="Dummy Write for atomic 
operation"; + WriteThrough, desc="simple writethrough request"; + WriteFlush, desc="synchronization message"; +} + +structure(FifoMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + FifoType Type, desc="WriteThrough/WriteFlush"; + int wfid, default="0",desc="wavefront id"; + MachineID Requestor, desc="Flush Requestor"; + MachineID oRequestor, desc="original Flush Requestor"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // This message declares no DataBlock field, so a functional write + // has nothing to update here and always returns false + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm new file mode 100644 index 000000000..f545c2fa7 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm @@ -0,0 +1,1408 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu, Sooraj Puthoor + */ + +/* + * This file is based on MOESI_AMD_Base.sm + * Differences with AMD base protocol + * -- Uses a probe filter memory to track sharers. + * -- The probe filter can be inclusive or non-inclusive + * -- Only two sharers tracked. 
Sharers are a) GPU or/and b) CPU + * -- If sharer information available, the sharer is probed + * -- If sharer information not available, probes are broadcasted + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + CacheMemory * ProbeFilterMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CAB_TCC := "False"; + int TCC_select_num_bits:=1; + bool useL3OnWT := "False"; + bool inclusiveDir := "True"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", ordered="false", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * triggerQueue, ordered="true"; + MessageBuffer * L3triggerQueue, ordered="true"; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it is busy waiting for the data + // which is possibly in the network. 
The cache which evicted the data + // might have moved to some other state after doing the eviction + // BS==> Received a read request; has not requested ownership + // B==> Received a read request; has requested ownership + // BM==> Received a modification request + B_P, AccessPermission:Backing_Store, desc="Back invalidation, waiting for probes"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + CtoD, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superceded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Replacement + PF_Repl, desc="Replace address from probe filter"; + + // Memory Controller + MemData, desc="Fetched 
data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { // L3* values are recorded against L3CacheMemory, PF* values against ProbeFilterMemory + L3DataArrayRead, desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the tag array"; + L3TagArrayWrite, desc="Write the tag array"; + + PFTagArrayRead, desc="Read the probe filter tag array"; + PFTagArrayWrite, desc="Write the probe filter tag array"; + } + + // TYPES + + enumeration(ProbeFilterState, desc="") { + T, desc="Tracked"; + NT, desc="Not tracked"; + B, desc="Blocked, This entry is being replaced"; + } + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + ProbeFilterState pfState, desc="ProbeFilter state",default="Directory_ProbeFilterState_NT"; + bool isOnCPU, desc="Block valid in the CPU complex",default="false"; + bool isOnGPU, desc="Block valid in the GPU complex",default="false"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, 
desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + Addr demandAddress, desc="Address of demand request which caused probe filter eviction"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + CacheEntry probeFilterEntry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(addr)); + if (inclusiveDir) { + if (is_valid(probeFilterEntry) && probeFilterEntry.pfState == ProbeFilterState:B) { + return State:B_P; + } + } 
+ return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. + if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + 
ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + bool isNotPresentProbeFilter(Addr address) { + if (ProbeFilterMemory.isTagPresent(address) || + ProbeFilterMemory.cacheAvail(address)) { + return false; + } + return true; + } + + bool isGPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnGPU){ + return true; + } + return false; + } + + bool isCPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnCPU){ + 
return true; + } + return false; + } + + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := 
static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. 
+ } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (inclusiveDir && isNotPresentProbeFilter(in_msg.addr)) { + Addr victim := ProbeFilterMemory.cacheProbe(in_msg.addr); + tbe := TBEs.lookup(victim); + entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(victim)); + trigger(Event:PF_Repl, victim, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", 
in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + // write-through and atomics do not 
send an unblock ack back to the + // directory. Hence, directory has to generate a self unblocking + // message. Additionally, write-throughs do not require data + // in their response. Hence, write-through is treated separately from + // write-back and atomics + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := 
tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + 
out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if ((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) { + // Don't Include TCCs unless there was write-CAB conflict in the TCC + } else if(noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(bp_backProbe, "bp", desc="back probe") { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + // won't be realistic for multisocket + out_msg.Destination.broadcast(MachineType:CorePair); + } + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + APPEND_TRANSITION_COMMENT(" - back probe"); + tbe.ProbeRequestStartTime := curCycle(); + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sm_setMRU, "sm", desc="set probe filter entry as MRU") { + ProbeFilterMemory.setMRU(address); + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk, + in_msg.addr); + } + } + + action(te_allocateTBEForEviction, "te", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.writeMask.clear(); + tbe.wtData := false; + tbe.atomicData := false; + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.NumPendingAcks := 0; + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + 
tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + DataBlock tmp := getDirectoryEntry(address).DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk, + tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // DO Nothing (already have the directory data) + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + } else if (tbe.Dirty) { + if(tbe.atomicData == false && 
tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + DPRINTF(RubySlicc, "Write cancel bit set on address %s\n", address); + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + 
L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(apf_allocateProbeFilterEntry, "apf", desc="Allocate probe filte entry") { + if (!ProbeFilterMemory.isTagPresent(address)) { + if (inclusiveDir) { + assert(ProbeFilterMemory.cacheAvail(address)); + } else if (ProbeFilterMemory.cacheAvail(address) == false) { + Addr victim := ProbeFilterMemory.cacheProbe(address); + ProbeFilterMemory.deallocate(victim); + } + assert(ProbeFilterMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.allocate(address, 
new CacheEntry)); + APPEND_TRANSITION_COMMENT(" allocating a new probe filter entry"); + entry.pfState := ProbeFilterState:NT; + if (inclusiveDir) { + entry.pfState := ProbeFilterState:T; + } + entry.isOnCPU := false; + entry.isOnGPU := false; + } + } + + action(mpfe_markPFEntryForEviction, "mpfe", desc="Mark this PF entry is being evicted") { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + entry.pfState := ProbeFilterState:B; + peek(requestNetwork_in, CPURequestMsg) { + tbe.demandAddress := in_msg.addr; + } + } + + action(we_wakeUpEvictionDependents, "we", desc="Wake up requests waiting for demand address and victim address") { + wakeUpBuffers(address); + wakeUpBuffers(tbe.demandAddress); + } + + action(dpf_deallocateProbeFilter, "dpf", desc="deallocate PF entry") { + assert(ProbeFilterMemory.isTagPresent(address)); + ProbeFilterMemory.deallocate(address); + } + + action(upf_updateProbeFilter, "upf", desc="") { + peek(requestNetwork_in, CPURequestMsg) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:CtoD) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) { + entry.isOnCPU := true; + } else { + entry.isOnGPU := true; + } + } + } + + action(rmcd_removeSharerConditional, 
"rmcd", desc="remove sharer from probe Filter, conditional") { + peek(requestNetwork_in, CPURequestMsg) { + if (ProbeFilterMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {//CorePair has inclusive L2 + if (in_msg.Type == CoherenceRequestType:VicDirty) { + entry.isOnCPU := false; + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + entry.isOnCPU := false; + } + } + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", 
desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, PF_Repl, B_P) {PFTagArrayRead, PFTagArrayWrite}{ + te_allocateTBEForEviction; + apf_allocateProbeFilterEntry; + bp_backProbe; + sm_setMRU; + mpfe_markPFEntryForEviction; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite}{ + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, 
CtoD, BP) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + ic_probeInvCore; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + //l_queueMemWBReq; // why need an ack? esp. with DRAMSim, just put it in queue no ack needed + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, WBAck) { + pm_popMemQueue; + } + + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, PF_Repl) { + zz_recycleRequestQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, 
L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + 
dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_P, ProbeAcksComplete, U) { + wd_writeBackData; + alwt_allocateL3BlockOnWT; + we_wakeUpEvictionDependents; + dpf_deallocateProbeFilter; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base.slicc b/src/mem/protocol/MOESI_AMD_Base.slicc new file mode 100644 index 000000000..b38145246 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base.slicc @@ -0,0 +1,6 @@ +protocol "MOESI_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-dir.sm"; diff --git a/src/mem/protocol/RubySlicc_ComponentMapping.sm b/src/mem/protocol/RubySlicc_ComponentMapping.sm index a72492b42..e1d7c4399 100644 --- a/src/mem/protocol/RubySlicc_ComponentMapping.sm +++ b/src/mem/protocol/RubySlicc_ComponentMapping.sm @@ -37,7 +37,10 @@ MachineID mapAddressToRange(Addr addr, MachineType type, NetDest broadcast(MachineType type); MachineID map_Address_to_DMA(Addr addr); MachineID map_Address_to_Directory(Addr addr); +MachineID map_Address_to_RegionDir(Addr addr); NodeID map_Address_to_DirectoryNode(Addr addr); +MachineID map_Address_to_TCCdir(Addr addr); +NodeID map_Address_to_TCCdirNode(Addr addr); NodeID machineIDToNodeID(MachineID 
machID); NodeID machineIDToVersion(MachineID machID); MachineType machineIDToMachineType(MachineID machID); diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm index 5ee26d65c..c743ebe28 100644 --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -62,7 +62,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt); // AccessPermission // The following five states define the access permission of all memory blocks. -// These permissions have multiple uses. They coordinate locking and +// These permissions have multiple uses. They coordinate locking and // synchronization primitives, as well as enable functional accesses. // One should not need to add any additional permission values and it is very // risky to do so. @@ -73,7 +73,7 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") Read_Write, desc="block is Read/Write"; // Possibly Invalid data - // The maybe stale permission indicates that accordingly to the protocol, + // The maybe stale permission indicates that accordingly to the protocol, // there is no guarantee the block contains valid data. However, functional // writes should update the block because a dataless PUT request may // revalidate the block's data. 
@@ -227,6 +227,13 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") { Collector, desc="Collector Mach"; L1Cache_wCC, desc="L1 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; L2Cache_wCC, desc="L2 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; + CorePair, desc="Cache Mach (2 cores, Private L1Ds, Shared L1I & L2)"; + TCP, desc="GPU L1 Data Cache (Texture Cache per Pipe)"; + TCC, desc="GPU L2 Shared Cache (Texture Cache per Channel)"; + TCCdir, desc="Directory at the GPU L2 Cache (TCC)"; + SQC, desc="GPU L1 Instr Cache (Sequencer Cache)"; + RegionDir, desc="Region-granular directory"; + RegionBuffer,desc="Region buffer for CPU and GPU"; NULL, desc="null mach type"; } diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index a6c57e1b0..b8d284725 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -31,8 +31,8 @@ // // **PLEASE NOTE!** When adding objects to this file you must also add a line -// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh -// file will not be copied to the protocol directory and you will encounter a +// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh +// file will not be copied to the protocol directory and you will encounter a // undefined declaration error. 
// @@ -95,6 +95,8 @@ structure (NetDest, external = "yes", non_obj="yes") { bool intersectionIsEmpty(Set); bool intersectionIsEmpty(NetDest); MachineID smallestElement(MachineType); + NetDest OR(NetDest); + NetDest AND(NetDest); } structure (Sequencer, external = "yes") { @@ -117,6 +119,44 @@ structure (Sequencer, external = "yes") { void invalidateSC(Addr); } +structure (GPUCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void checkCoherence(Addr); + void evictionCallback(Addr); + void recordCPReadCallBack(MachineID, MachineID); + void recordCPWriteCallBack(MachineID, MachineID); +} + +structure (VIPERCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void invCallback(Addr); + void wbCallback(Addr); + void checkCoherence(Addr); + void evictionCallback(Addr); +} + structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -161,6 +201,7 @@ structure (CacheMemory, external = 
"yes") { Cycles getTagLatency(); Cycles getDataLatency(); void setMRU(Addr); + void setMRU(Addr, int); void setMRU(AbstractCacheEntry); void recordRequestType(CacheRequestType, Addr); bool checkResourceAvailable(CacheResourceType, Addr); diff --git a/src/mem/protocol/SConsopts b/src/mem/protocol/SConsopts index ca432a73e..47b36e276 100644 --- a/src/mem/protocol/SConsopts +++ b/src/mem/protocol/SConsopts @@ -33,6 +33,11 @@ import os Import('*') all_protocols.extend([ + 'GPU_VIPER', + 'GPU_VIPER_Baseline', + 'GPU_VIPER_Region', + 'GPU_RfO', + 'MOESI_AMD_Base', 'MESI_Two_Level', 'MESI_Three_Level', 'MI_example', diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 16e932432..82a16c9b0 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -124,13 +124,20 @@ MakeInclude('common/Set.hh') MakeInclude('common/WriteMask.hh') MakeInclude('filters/AbstractBloomFilter.hh') MakeInclude('network/MessageBuffer.hh') -MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/CacheMemory.hh') -MakeInclude('system/DMASequencer.hh') MakeInclude('structures/DirectoryMemory.hh') -MakeInclude('structures/WireBuffer.hh') MakeInclude('structures/PerfectCacheMemory.hh') MakeInclude('structures/PersistentTable.hh') -MakeInclude('system/Sequencer.hh') +MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/TBETable.hh') MakeInclude('structures/TimerTable.hh') +MakeInclude('structures/WireBuffer.hh') +MakeInclude('system/DMASequencer.hh') +MakeInclude('system/Sequencer.hh') + +# External types : Group "mem/protocol" : include "header.hh" to the bottom +# of this MakeIncludes if it is referenced as +# <# include "mem/protocol/header.hh"> in any file +# generated_dir = Dir('../protocol') +MakeInclude('system/GPUCoalescer.hh') +MakeInclude('system/VIPERCoalescer.hh') diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index b3b37e5a6..7d3f20982 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ 
b/src/mem/ruby/profiler/Profiler.cc @@ -269,7 +269,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { m_outstandReqHist.add(seq->getOutstandReqHist()); } @@ -282,7 +282,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { // add all the latencies m_latencyHist.add(seq->getLatencyHist()); diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 926556781..cbd068c04 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); } + int validBlocks; + virtual int& getNumValidBlocks() + { + return validBlocks; + } + // Functions for locking and unlocking the cache entry. These are required // for supporting atomic memory accesses. 
void setLocked(int context); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 93fe50c88..458fde5bc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr) } } +bool +AbstractController::isBlocked(Addr addr) +{ + return (m_block_map.count(addr) > 0); +} + BaseMasterPort & AbstractController::getMasterPort(const std::string &if_name, PortID idx) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 383507eed..4488ee3f4 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer // return instance name void blockOnQueue(Addr, MessageBuffer*); void unblock(Addr); + bool isBlocked(Addr); virtual MessageBuffer* getMandatoryQueue() const = 0; virtual MessageBuffer* getMemoryQueue() const = 0; @@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer virtual void regStats(); virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0; - virtual Sequencer* getSequencer() const = 0; + virtual Sequencer* getCPUSequencer() const = 0; //! These functions are used by ruby system to read/write the data blocks //! that exist with in the controller. 
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 46071335e..cdedc2e14 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr) return DirectoryMemory::mapAddressToDirectoryVersion(addr); } +inline NodeID +map_Address_to_TCCdirNode(Addr addr) +{ + return DirectoryMemory::mapAddressToDirectoryVersion(addr); +} + // used to determine the home directory // returns a value between 0 and total_directories_within_the_system inline MachineID @@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr) return mach; } +inline MachineID +map_Address_to_RegionDir(Addr addr) +{ + MachineID mach = {MachineType_RegionDir, + map_Address_to_DirectoryNode(addr)}; + return mach; +} + +inline MachineID +map_Address_to_TCCdir(Addr addr) +{ + MachineID mach = + {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)}; + return mach; +} + inline NetDest broadcast(MachineType type) { @@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id) return mach; } +inline MachineID +MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node) +{ + MachineID mach = {type, node}; + return mach; +} + #endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__ diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index a8a3ba949..45fb85d05 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -35,6 +35,7 @@ #include "mem/protocol/AccessPermission.hh" #include "mem/ruby/structures/CacheMemory.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/WeightedLRUPolicy.hh" using namespace std; @@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p) m_start_index_bit = p->start_index_bit; m_is_instruction_only_cache = p->is_icache; m_resource_stalls = p->resourceStalls; + 
m_block_size = p->block_size; // may be 0 at this point. Updated in init() } void CacheMemory::init() { - m_cache_num_sets = (m_cache_size / m_cache_assoc) / - RubySystem::getBlockSizeBytes(); + if (m_block_size == 0) { + m_block_size = RubySystem::getBlockSizeBytes(); + } + m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); assert(m_cache_num_set_bits > 0); - m_cache.resize(m_cache_num_sets); - for (int i = 0; i < m_cache_num_sets; i++) { - m_cache[i].resize(m_cache_assoc); - for (int j = 0; j < m_cache_assoc; j++) { - m_cache[i][j] = NULL; - } - } + m_cache.resize(m_cache_num_sets, + std::vector(m_cache_assoc, nullptr)); } CacheMemory::~CacheMemory() { - if (m_replacementPolicy_ptr != NULL) + if (m_replacementPolicy_ptr) delete m_replacementPolicy_ptr; for (int i = 0; i < m_cache_num_sets; i++) { for (int j = 0; j < m_cache_assoc; j++) { @@ -358,6 +357,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e) m_replacementPolicy_ptr->touch(cacheSet, loc, curTick()); } +void +CacheMemory::setMRU(Addr address, int occupancy) +{ + int64_t cacheSet = addressToCacheSet(address); + int loc = findTagInSet(cacheSet, address); + + if(loc != -1) { + if (m_replacementPolicy_ptr->useOccupancy()) { + (static_cast(m_replacementPolicy_ptr))-> + touch(cacheSet, loc, curTick(), occupancy); + } else { + m_replacementPolicy_ptr-> + touch(cacheSet, loc, curTick()); + } + } +} + +int +CacheMemory::getReplacementWeight(int64_t set, int64_t loc) +{ + assert(set < m_cache_num_sets); + assert(loc < m_cache_assoc); + int ret = 0; + if(m_cache[set][loc] != NULL) { + ret = m_cache[set][loc]->getNumValidBlocks(); + assert(ret >= 0); + } + + return ret; +} + void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 72805b32b..5b30505d3 100644 --- 
a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -106,7 +106,8 @@ class CacheMemory : public SimObject // Set this address to most recently used void setMRU(Addr address); - // Set this entry to most recently used + void setMRU(Addr addr, int occupancy); + int getReplacementWeight(int64_t set, int64_t loc); void setMRU(const AbstractCacheEntry *e); // Functions for locking and unlocking cache lines corresponding to the @@ -146,6 +147,7 @@ class CacheMemory : public SimObject Stats::Scalar numDataArrayStalls; int getCacheSize() const { return m_cache_size; } + int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } Addr getAddressAtIdx(int idx) const; @@ -182,6 +184,7 @@ class CacheMemory : public SimObject int m_cache_assoc; int m_start_index_bit; bool m_resource_stalls; + int m_block_size; }; std::ostream& operator<<(std::ostream& out, const CacheMemory& obj); diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 4eb87ac74..9fc4726b0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -42,6 +42,7 @@ class RubyCache(SimObject): "") start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); is_icache = Param.Bool(False, "is instruction only cache"); + block_size = Param.MemorySize("0B", "block size in bytes. 0 means default RubyBlockSize") dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc new file mode 100644 index 000000000..db279bd3a --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = 
HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. 
Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus 
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == 
RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + 
m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? 
+ // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == RubyRequestType_Locked_RMW_Read) || + (request->m_type == 
RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + 
request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. 
Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. 
+RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). 
+ // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. 
+ // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. + // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! 
+ return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector accessMask(blockSize,false); + std::vector< std::pair > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template +std::ostream & +operator<<(ostream &out, const 
std::unordered_map &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. 
These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... 
+ + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = 
srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? 
"Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in 
the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh new file mode 100644 index 000000000..dbd47059c --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + +#include +#include + +#include "base/statistics.hh" +#include "mem/protocol/HSAScope.hh" +#include "mem/protocol/HSASegment.hh" +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/protocol/SequencerRequestType.hh" +#include "mem/request.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class RubyGPUCoalescerParams; + +HSAScope reqScopeToHSAScope(Request* req); +HSASegment reqSegmentToHSASegment(Request* req); + +struct GPUCoalescerRequest +{ + PacketPtr pkt; + RubyRequestType m_type; + Cycles issue_time; + + GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, + Cycles _issue_time) + : pkt(_pkt), 
m_type(_m_type), issue_time(_issue_time) + {} +}; + +std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); + +class GPUCoalescer : public RubyPort +{ + public: + typedef RubyGPUCoalescerParams Params; + GPUCoalescer(const Params *); + ~GPUCoalescer(); + + // Public Methods + void wakeup(); // Used only for deadlock detection + + void printProgress(std::ostream& out) const; + void resetStats(); + void collateStats(); + void regStats(); + + void writeCallback(Addr address, DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + /* atomics need their own callback because the data + might be const coming from SLICC */ + void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); + + void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); + void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); + + // Alternate implementations in VIPER Coalescer + virtual RequestStatus makeRequest(PacketPtr pkt); + + int outstandingCount() const { return m_outstanding_count; } + + bool + isDeadlockEventScheduled() const + { + return deadlockCheckEvent.scheduled(); + } + + 
void + descheduleDeadlockEvent() + { + deschedule(deadlockCheckEvent); + } + + bool empty() const; + + void print(std::ostream& out) const; + void checkCoherence(Addr address); + + void markRemoved(); + void removeRequest(GPUCoalescerRequest* request); + void evictionCallback(Addr address); + void completeIssue(); + + void insertKernel(int wavefront_id, PacketPtr pkt); + + void recordRequestType(SequencerRequestType requestType); + Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } + + Stats::Histogram& getLatencyHist() { return m_latencyHist; } + Stats::Histogram& getTypeLatencyHist(uint32_t t) + { return *m_typeLatencyHist[t]; } + + Stats::Histogram& getMissLatencyHist() + { return m_missLatencyHist; } + Stats::Histogram& getMissTypeLatencyHist(uint32_t t) + { return *m_missTypeLatencyHist[t]; } + + Stats::Histogram& getMissMachLatencyHist(uint32_t t) const + { return *m_missMachLatencyHist[t]; } + + Stats::Histogram& + getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const + { return *m_missTypeMachLatencyHist[r][t]; } + + Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const + { return *m_IssueToInitialDelayHist[t]; } + + Stats::Histogram& + getInitialToForwardDelayHist(const MachineType t) const + { return *m_InitialToForwardDelayHist[t]; } + + Stats::Histogram& + getForwardRequestToFirstResponseHist(const MachineType t) const + { return *m_ForwardToFirstResponseDelayHist[t]; } + + Stats::Histogram& + getFirstResponseToCompletionDelayHist(const MachineType t) const + { return *m_FirstResponseToCompletionDelayHist[t]; } + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + bool tryCacheAccess(Addr addr, RubyRequestType type, + Addr pc, RubyAccessMode access_mode, + int size, DataBlock*& data_ptr); + // Alternate implementations in VIPER Coalescer + virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + + void kernelCallback(int wavfront_id); + + void hitCallback(GPUCoalescerRequest* request, 
+ MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + void recordMissLatency(GPUCoalescerRequest* request, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion); + void completeHitCallback(std::vector & mylist, int len); + PacketPtr mapAddrToPkt(Addr address); + + + RequestStatus getRequestStatus(PacketPtr pkt, + RubyRequestType request_type); + bool insertRequest(PacketPtr pkt, RubyRequestType request_type); + + bool handleLlsc(Addr address, GPUCoalescerRequest* request); + + // Private copy constructor and assignment operator + GPUCoalescer(const GPUCoalescer& obj); + GPUCoalescer& operator=(const GPUCoalescer& obj); + + class IssueEvent : public Event + { + private: + GPUCoalescer *seq; + public: + IssueEvent(GPUCoalescer *_seq); + void process(); + const char *description() const; + }; + + IssueEvent issueEvent; + + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + int m_max_outstanding_requests; + int m_deadlock_threshold; + + CacheMemory* m_dataCache_ptr; + CacheMemory* m_instCache_ptr; + + // The cache access latency for this GPU data cache. This is assessed at the + // beginning of each access. This should be very similar to the + // implementation in Sequencer() as this is very much like a Sequencer + Cycles m_data_cache_hit_latency; + + // We need to track both the primary and secondary request types. + // The secondary request type comprises a subset of RubyRequestTypes that + // are understood by the L1 Controller. A primary request type can be any + // RubyRequestType. 
+ enum {PrimaryType, SecondaryType}; + typedef std::pair > RequestDesc; + typedef std::unordered_map > CoalescingTable; + CoalescingTable reqCoalescer; + std::vector newRequests; + + typedef std::unordered_map RequestTable; + RequestTable m_writeRequestTable; + RequestTable m_readRequestTable; + // Global outstanding request count, across all request tables + int m_outstanding_count; + bool m_deadlock_check_scheduled; + std::unordered_map kernelEndList; + std::vector newKernelEnds; + + int m_store_waiting_on_load_cycles; + int m_store_waiting_on_store_cycles; + int m_load_waiting_on_store_cycles; + int m_load_waiting_on_load_cycles; + + bool m_usingNetworkTester; + + class GPUCoalescerWakeupEvent : public Event + { + private: + GPUCoalescer *m_GPUCoalescer_ptr; + + public: + GPUCoalescerWakeupEvent(GPUCoalescer *_seq) : + m_GPUCoalescer_ptr(_seq) {} + void process() { m_GPUCoalescer_ptr->wakeup(); } + const char *description() const + { + return "GPUCoalescer deadlock check"; + } + }; + + GPUCoalescerWakeupEvent deadlockCheckEvent; + bool assumingRfOCoherence; + + // m5 style stats for TCP hit/miss counts + Stats::Scalar GPU_TCPLdHits; + Stats::Scalar GPU_TCPLdTransfers; + Stats::Scalar GPU_TCCLdHits; + Stats::Scalar GPU_LdMiss; + + Stats::Scalar GPU_TCPStHits; + Stats::Scalar GPU_TCPStTransfers; + Stats::Scalar GPU_TCCStHits; + Stats::Scalar GPU_StMiss; + + Stats::Scalar CP_TCPLdHits; + Stats::Scalar CP_TCPLdTransfers; + Stats::Scalar CP_TCCLdHits; + Stats::Scalar CP_LdMiss; + + Stats::Scalar CP_TCPStHits; + Stats::Scalar CP_TCPStTransfers; + Stats::Scalar CP_TCCStHits; + Stats::Scalar CP_StMiss; + + //! Histogram for number of outstanding requests per cycle. + Stats::Histogram m_outstandReqHist; + + //! Histogram for holding latency profile of all requests. + Stats::Histogram m_latencyHist; + std::vector m_typeLatencyHist; + + //! Histogram for holding latency profile of all requests that + //! miss in the controller connected to this sequencer. 
+ Stats::Histogram m_missLatencyHist; + std::vector m_missTypeLatencyHist; + + //! Histograms for profiling the latencies for requests that + //! required external messages. + std::vector m_missMachLatencyHist; + std::vector< std::vector > m_missTypeMachLatencyHist; + + //! Histograms for recording the breakdown of miss latency + std::vector m_IssueToInitialDelayHist; + std::vector m_InitialToForwardDelayHist; + std::vector m_ForwardToFirstResponseDelayHist; + std::vector m_FirstResponseToCompletionDelayHist; +}; + +inline std::ostream& +operator<<(std::ostream& out, const GPUCoalescer& obj) +{ + obj.print(out); + out << std::flush; + return out; +} + +#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py new file mode 100644 index 000000000..0c19f875d --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from Sequencer import * + +class RubyGPUCoalescer(RubySequencer): + type = 'RubyGPUCoalescer' + cxx_class = 'GPUCoalescer' + cxx_header = "mem/ruby/system/GPUCoalescer.hh" + + # max_outstanding_requests = (wave front slots) x (wave front size) + max_outstanding_requests = Param.Int(40*64, + "max requests (incl. 
prefetches) outstanding") + assume_rfo = Param.Bool(True, "assume protocol implementes Read for " + "Ownership coherence"); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5a5f528bb..bf4002126 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p) memSlavePort(csprintf("%s-mem-slave-port", name()), this, p->ruby_system->getAccessBackingStore(), -1, p->no_retry_on_stall), - gotAddrRanges(p->port_master_connection_count) + gotAddrRanges(p->port_master_connection_count), + m_isCPUSequencer(p->is_cpu_sequencer) { assert(m_version != -1); diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 07e0fde5a..6bd92b654 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -167,6 +167,8 @@ class RubyPort : public MemObject uint32_t getId() { return m_version; } DrainState drain() override; + bool isCPUSequencer() { return m_isCPUSequencer; } + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); @@ -218,6 +220,8 @@ class RubyPort : public MemObject // that should be called when the Sequencer becomes available after a stall. 
// std::vector retryList; + + bool m_isCPUSequencer; }; #endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 1ecd2e098..e1717e519 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript index 8c5077362..b67311bca 100644 --- a/src/mem/ruby/system/SConscript +++ b/src/mem/ruby/system/SConscript @@ -33,12 +33,22 @@ Import('*') if env['PROTOCOL'] == 'None': Return() +if env['BUILD_GPU']: + SimObject('GPUCoalescer.py') SimObject('RubySystem.py') SimObject('Sequencer.py') +SimObject('WeightedLRUReplacementPolicy.py') +if env['BUILD_GPU']: + SimObject('VIPERCoalescer.py') Source('CacheRecorder.cc') Source('DMASequencer.cc') +if env['BUILD_GPU']: + Source('GPUCoalescer.cc') Source('RubyPort.cc') Source('RubyPortProxy.cc') Source('RubySystem.cc') Source('Sequencer.cc') +if env['BUILD_GPU']: + Source('VIPERCoalescer.cc') +Source('WeightedLRUPolicy.cc') diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 50418c700..c2727b41d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p) m_max_outstanding_requests = p->max_outstanding_requests; m_deadlock_threshold = p->deadlock_threshold; + m_coreId = p->coreid; // for tracking the two CorePair sequencers assert(m_max_outstanding_requests > 0); assert(m_deadlock_threshold > 0); assert(m_instCache_ptr != NULL); @@ -593,6 +594,8 @@ 
Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) ContextID proc_id = pkt->req->hasContextId() ? pkt->req->contextId() : InvalidContextID; + ContextID core_id = coreId(); + // If valid, copy the pc to the ruby request Addr pc = 0; if (pkt->req->hasPC()) { @@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) nullptr : pkt->getPtr(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id); + PrefetchBit_No, proc_id, core_id); DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 47af7ea1e..2a2f49587 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -99,6 +99,7 @@ class Sequencer : public RubyPort void markRemoved(); void evictionCallback(Addr address); void invalidateSC(Addr address); + int coreId() const { return m_coreId; } void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -198,6 +199,8 @@ class Sequencer : public RubyPort Stats::Scalar m_load_waiting_on_store; Stats::Scalar m_load_waiting_on_load; + int m_coreId; + bool m_usingNetworkTester; //! Histogram for number of outstanding requests per cycle. 
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 7c90eb29c..d6ee0aa2f 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -32,54 +32,58 @@ from m5.proxy import * from MemObject import MemObject class RubyPort(MemObject): - type = 'RubyPort' - abstract = True - cxx_header = "mem/ruby/system/RubyPort.hh" - version = Param.Int(0, "") + type = 'RubyPort' + abstract = True + cxx_header = "mem/ruby/system/RubyPort.hh" + version = Param.Int(0, "") - slave = VectorSlavePort("CPU slave port") - master = VectorMasterPort("CPU master port") - pio_master_port = MasterPort("Ruby mem master port") - mem_master_port = MasterPort("Ruby mem master port") - pio_slave_port = SlavePort("Ruby pio slave port") - mem_slave_port = SlavePort("Ruby memory port") + slave = VectorSlavePort("CPU slave port") + master = VectorMasterPort("CPU master port") + pio_master_port = MasterPort("Ruby mem master port") + mem_master_port = MasterPort("Ruby mem master port") + pio_slave_port = SlavePort("Ruby pio slave port") + mem_slave_port = SlavePort("Ruby memory port") - using_ruby_tester = Param.Bool(False, "") - no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") - support_data_reqs = Param.Bool(True, "data cache requests supported") - support_inst_reqs = Param.Bool(True, "inst cache requests supported") + using_ruby_tester = Param.Bool(False, "") + no_retry_on_stall = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") + support_data_reqs = Param.Bool(True, "data cache requests supported") + support_inst_reqs = Param.Bool(True, "inst cache requests supported") + is_cpu_sequencer = Param.Bool(True, "connected to a cpu") class RubyPortProxy(RubyPort): - type = 'RubyPortProxy' - cxx_header = "mem/ruby/system/RubyPortProxy.hh" + type = 'RubyPortProxy' + cxx_header = 
"mem/ruby/system/RubyPortProxy.hh" class RubySequencer(RubyPort): - type = 'RubySequencer' - cxx_class = 'Sequencer' - cxx_header = "mem/ruby/system/Sequencer.hh" + type = 'RubySequencer' + cxx_class = 'Sequencer' + cxx_header = "mem/ruby/system/Sequencer.hh" - icache = Param.RubyCache("") - dcache = Param.RubyCache("") - # Cache latencies currently assessed at the beginning of each access - # NOTE: Setting these values to a value greater than one will result in - # O3 CPU pipeline bubbles and negatively impact performance - # TODO: Latencies should be migrated into each top-level cache controller - icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") - dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") - max_outstanding_requests = Param.Int(16, - "max requests (incl. prefetches) outstanding") - deadlock_threshold = Param.Cycles(500000, - "max outstanding cycles for a request before deadlock/livelock declared") - using_network_tester = Param.Bool(False, "") + icache = Param.RubyCache("") + dcache = Param.RubyCache("") + # Cache latencies currently assessed at the beginning of each access + # NOTE: Setting these values to a value greater than one will result in + # O3 CPU pipeline bubbles and negatively impact performance + # TODO: Latencies should be migrated into each top-level cache controller + icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") + dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") + max_outstanding_requests = Param.Int(16, + "max requests (incl. 
prefetches) outstanding") + deadlock_threshold = Param.Cycles(500000, + "max outstanding cycles for a request before deadlock/livelock declared") + using_network_tester = Param.Bool(False, "") + # id used by protocols that support multiple sequencers per controller + # 99 is the dummy default value + coreid = Param.Int(99, "CorePair core id") class DMASequencer(MemObject): - type = 'DMASequencer' - cxx_header = "mem/ruby/system/DMASequencer.hh" + type = 'DMASequencer' + cxx_header = "mem/ruby/system/DMASequencer.hh" - version = Param.Int(0, "") - slave = SlavePort("Device slave port") - using_ruby_tester = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") + version = Param.Int(0, "") + slave = SlavePort("Device slave port") + using_ruby_tester = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc new file mode 100644 index 000000000..ca91f2723 --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/VIPERCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/VIPERCoalescer.hh" + +using namespace std; + +VIPERCoalescer * +VIPERCoalescerParams::create() +{ + return new VIPERCoalescer(this); +} + +VIPERCoalescer::VIPERCoalescer(const Params *p) + : GPUCoalescer(p) +{ + 
m_max_wb_per_cycle=p->max_wb_per_cycle; + m_max_inv_per_cycle=p->max_inv_per_cycle; + m_outstanding_inv = 0; + m_outstanding_wb = 0; +} + +VIPERCoalescer::~VIPERCoalescer() +{ +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +VIPERCoalescer::makeRequest(PacketPtr pkt) +{ + if (m_outstanding_wb | m_outstanding_inv) { + DPRINTF(GPUCoalescer, + "There are %d Writebacks and %d Invalidatons\n", + m_outstanding_wb, m_outstanding_inv); + } + // Are we in the middle of a release + if ((m_outstanding_wb) > 0) { + if (pkt->req->isKernel()) { + // Everything is fine + // Barriers and Kernel End can coalesce + // If it is a Kernel Begin flush the cache + if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { + invL1(); + } + + if (pkt->req->isRelease()) { + insertKernel(pkt->req->contextId(), pkt); + } + + return RequestStatus_Issued; + } +// return RequestStatus_Aliased; + } else if (pkt->req->isKernel() && pkt->req->isRelease()) { + // Flush Dirty Data on Kernel End + // isKernel + isRelease + insertKernel(pkt->req->contextId(), pkt); + wbL1(); + if(m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + return RequestStatus_Issued; + } + RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); + if (requestStatus!=RequestStatus_Issued) { + // Request not issued + // enqueue Retry + DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n"); + return
requestStatus; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + // Invalidate clean Data on Kernel Begin + // isKernel + isAcquire + invL1(); + } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { + // Deschedule the AtomicAcqRel and + // Flush and Invalidate the L1 cache + invwbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isRelease()) { + // Deschedule the StoreRel and + // Flush the L1 cache + wbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isAcquire()) { + // LoadAcq or AtomicAcq + // Invalidate the L1 cache + invL1(); + } + // Request was successful + if (m_outstanding_wb == 0) { + if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); + schedule(issueEvent, curTick()); + } + } + return RequestStatus_Issued; +} + +void +VIPERCoalescer::wbCallback(Addr addr) +{ + m_outstanding_wb--; + // if L1 Flush Complete + // attempt to schedule issueEvent + assert(((int) m_outstanding_wb) >= 0); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +void +VIPERCoalescer::invCallback(Addr addr) +{ + m_outstanding_inv--; + // if L1 Flush Complete + // attempt to schedule issueEvent + // This probably won't happen, since + // we don't wait on cache invalidations + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +/** + * Invalidate L1 cache (Acquire) + */ +void +VIPERCoalescer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding
before Cache Walk\n", + m_outstanding_inv); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_outstanding_inv); +} + +/** + * Writeback L1 cache (Release) + */ +void +VIPERCoalescer::wbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding before Cache Walk\n", + m_outstanding_wb); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding after Cache Walk\n", + m_outstanding_wb); +} + +/** + * Invalidate and Writeback L1 cache (Acquire&Release) + */ +void +VIPERCoalescer::invwbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + // Walk the cache + for(int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + // Walk the cache + for(int i = 0; i< size; i++) { + Addr addr = 
m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } +} diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh new file mode 100644 index 000000000..af6e44e7f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + +#include + +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class VIPERCoalescerParams; + +class VIPERCoalescer : public GPUCoalescer +{ + public: + typedef VIPERCoalescerParams Params; + VIPERCoalescer(const Params *); + ~VIPERCoalescer(); + void wbCallback(Addr address); + void invCallback(Addr address); + RequestStatus makeRequest(PacketPtr pkt); + private: + void invL1(); + void wbL1(); + void invwbL1(); + uint64_t m_outstanding_inv; + uint64_t m_outstanding_wb; + uint64_t m_max_inv_per_cycle; + uint64_t m_max_wb_per_cycle; +}; +#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py new file mode 100644 index 000000000..05c74386f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from GPUCoalescer import * + +class VIPERCoalescer(RubyGPUCoalescer): + type = 'VIPERCoalescer' + cxx_class = 'VIPERCoalescer' + cxx_header = "mem/ruby/system/VIPERCoalescer.hh" + max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") + max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") + assume_rfo = False diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc new file mode 100644 index 000000000..5baa4d9a5 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Derek Hower + */ + +#include "mem/ruby/system/WeightedLRUPolicy.hh" + +WeightedLRUPolicy::WeightedLRUPolicy(const Params* p) + : AbstractReplacementPolicy(p), m_cache(p->cache) +{ + m_last_occ_ptr = new int*[m_num_sets]; + for(unsigned i = 0; i < m_num_sets; i++){ + m_last_occ_ptr[i] = new int[m_assoc]; + for(unsigned j = 0; j < m_assoc; j++){ + m_last_occ_ptr[i][j] = 0; + } + } +} + +WeightedLRUPolicy * +WeightedLRUReplacementPolicyParams::create() +{ + return new WeightedLRUPolicy(this); +} + +WeightedLRUPolicy::~WeightedLRUPolicy() +{ + if (m_last_occ_ptr != NULL){ + for (unsigned i = 0; i < m_num_sets; i++){ + if (m_last_occ_ptr[i] != NULL){ + delete[] m_last_occ_ptr[i]; + } + } + delete[] m_last_occ_ptr; + } +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; + m_last_occ_ptr[set][index] = occupancy; +} + +int64_t +WeightedLRUPolicy::getVictim(int64_t set) const +{ + Tick time, smallest_time; + int64_t smallest_index; + // Pick the lowest-occupancy way; break ties by oldest reference time. + smallest_index = 0; + smallest_time = m_last_ref_ptr[set][0]; + int smallest_weight = m_last_occ_ptr[set][0]; + // Way 0 seeds the search, so its weight must be its occupancy too. + for (unsigned i = 1; i <
m_assoc; i++) { + + int weight = m_last_occ_ptr[set][i]; + if (weight < smallest_weight) { + smallest_weight = weight; + smallest_index = i; + smallest_time = m_last_ref_ptr[set][i]; + } else if (weight == smallest_weight) { + time = m_last_ref_ptr[set][i]; + if (time < smallest_time) { + smallest_index = i; + smallest_time = time; + } + } + } + return smallest_index; +} diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh new file mode 100644 index 000000000..3150779b2 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.hh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ +#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ + +#include "mem/ruby/structures/AbstractReplacementPolicy.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "params/WeightedLRUReplacementPolicy.hh" + +/* LRU variant that also tracks a per-way occupancy weight for getVictim() */ + +class WeightedLRUPolicy : public AbstractReplacementPolicy +{ + public: + typedef WeightedLRUReplacementPolicyParams Params; + WeightedLRUPolicy(const Params* p); + ~WeightedLRUPolicy(); + // Record the reference time (and optionally the occupancy) of a way. + void touch(int64_t set, int64_t way, Tick time); + void touch(int64_t set, int64_t way, Tick time, int occupancy); + int64_t getVictim(int64_t set) const override; + + bool useOccupancy() const { return true; } + // m_last_occ_ptr[set][way] holds the occupancy passed to touch(). + CacheMemory * m_cache; + int **m_last_occ_ptr; +}; + +#endif // __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py new file mode 100644 index 000000000..e7de33496 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1.
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Derek Hower +# + +from m5.params import * +from m5.proxy import * +from MemObject import MemObject +from ReplacementPolicy import ReplacementPolicy + +class WeightedLRUReplacementPolicy(ReplacementPolicy): + type = "WeightedLRUReplacementPolicy" + cxx_class = "WeightedLRUPolicy" + cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh" + cache = Param.RubyCache("") diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py index a530307ee..fc3f32c3d 100644 --- a/src/mem/slicc/symbols/StateMachine.py +++ b/src/mem/slicc/symbols/StateMachine.py @@ -35,13 +35,17 @@ import re python_class_map = { "int": "Int", + "NodeID": "Int", "uint32_t" : "UInt32", "std::string": "String", "bool": "Bool", "CacheMemory": "RubyCache", "WireBuffer": "RubyWireBuffer", "Sequencer": "RubySequencer", + "GPUCoalescer" : "RubyGPUCoalescer", + "VIPERCoalescer" : "VIPERCoalescer", "DirectoryMemory": "RubyDirectoryMemory", + "PerfectCacheMemory": "RubyPerfectCacheMemory", "MemoryControl": "MemoryControl", "MessageBuffer": "MessageBuffer", "DMASequencer": "DMASequencer", @@ -305,7 +309,7 @@ class $c_ident : public AbstractController void collateStats(); void recordCacheTrace(int cntrl, CacheRecorder* tr); - Sequencer* getSequencer() const; + Sequencer* getCPUSequencer() const; int functionalWriteBuffers(PacketPtr&); @@ -527,8 +531,14 @@ $c_ident::$c_ident(const Params *p) else: code('m_${{param.ident}} = p->${{param.ident}};') - if re.compile("sequencer").search(param.ident): - code('m_${{param.ident}}_ptr->setController(this);') + if re.compile("sequencer").search(param.ident) or \ + param.type_ast.type.c_ident == "GPUCoalescer" or \ + param.type_ast.type.c_ident == "VIPERCoalescer": + code(''' +if (m_${{param.ident}}_ptr != NULL) { + m_${{param.ident}}_ptr->setController(this); +} +''') code(''' @@ -670,6 +680,28 @@ $c_ident::init() assert(param.pointer) seq_ident = "m_%s_ptr" % param.ident + if seq_ident != "NULL": + code(''' +Sequencer* 
+$c_ident::getCPUSequencer() const +{ + if (NULL != $seq_ident && $seq_ident->isCPUSequencer()) { + return $seq_ident; + } else { + return NULL; + } +} +''') + else: + code(''' + +Sequencer* +$c_ident::getCPUSequencer() const +{ + return NULL; +} +''') + code(''' void @@ -796,12 +828,6 @@ $c_ident::getMemoryQueue() const return $memq_ident; } -Sequencer* -$c_ident::getSequencer() const -{ - return $seq_ident; -} - void $c_ident::print(ostream& out) const { diff --git a/tests/SConscript b/tests/SConscript index e9c9a6432..886b7fe59 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -348,20 +348,26 @@ if env['TARGET_ISA'] == 'arm': 'realview64-switcheroo-timing', 'realview64-switcheroo-o3', 'realview64-switcheroo-full'] -if env['TARGET_ISA'] == 'x86': +if env['TARGET_ISA'] == 'x86' and not env['BUILD_GPU']: configs += ['pc-simple-atomic', 'pc-simple-timing', 'pc-o3-timing', 'pc-switcheroo-full'] -configs += ['simple-atomic', 'simple-atomic-mp', - 'simple-timing', 'simple-timing-mp', - 'minor-timing', 'minor-timing-mp', - 'o3-timing', 'o3-timing-mt', 'o3-timing-mp', - 'rubytest', 'memtest', 'memtest-filter', - 'tgen-simple-mem', 'tgen-dram-ctrl'] - -configs += ['learning-gem5-p1-simple', 'learning-gem5-p1-two-level'] +if env['TARGET_ISA'] == 'x86' and env['BUILD_GPU'] and \ + env['TARGET_GPU_ISA'] == 'hsail': + configs += ['gpu'] + if env['PROTOCOL'] == 'GPU_RfO': + configs += ['gpu-randomtest'] +else: + configs += ['simple-atomic', 'simple-atomic-mp', + 'simple-timing', 'simple-timing-mp', + 'minor-timing', 'minor-timing-mp', + 'o3-timing', 'o3-timing-mt', 'o3-timing-mp', + 'rubytest', 'memtest', 'memtest-filter', + 'tgen-simple-mem', 'tgen-dram-ctrl'] + + configs += ['learning-gem5-p1-simple', 'learning-gem5-p1-two-level'] if env['PROTOCOL'] != 'None': if env['PROTOCOL'] == 'MI_example': diff --git a/tests/configs/gpu-randomtest-ruby.py b/tests/configs/gpu-randomtest-ruby.py new file mode 100644 index 000000000..92e300394 --- /dev/null +++ 
b/tests/configs/gpu-randomtest-ruby.py @@ -0,0 +1,151 @@ +# +# Copyright (c) 2010-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Brad Beckmann +# + +import m5 +from m5.objects import * +from m5.defines import buildEnv +from m5.util import addToPath +import os, optparse, sys + +# Get paths we might need. It's expected this file is in m5/configs/example. 
+config_path = os.path.dirname(os.path.abspath(__file__)) +config_root = os.path.dirname(config_path) +m5_root = os.path.dirname(config_root) +addToPath(config_root+'/configs/common') +addToPath(config_root+'/configs/ruby') +addToPath(config_root+'/configs/topologies') + +import Ruby +import Options + +parser = optparse.OptionParser() +Options.addCommonOptions(parser) + +# add the gpu specific options expected by the the gpu and gpu_RfO +parser.add_option("-u", "--num-compute-units", type="int", default=8, + help="number of compute units in the GPU") +parser.add_option("--numCPs", type="int", default=0, + help="Number of GPU Command Processors (CP)") +parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \ + "per CU") +parser.add_option("--wf-size", type="int", default=64, + help="Wavefront size(in workitems)") +parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \ + "WF slots per SIMD") + +# Add the ruby specific and protocol specific options +Ruby.define_options(parser) + +(options, args) = parser.parse_args() + +# +# Set the default cache size and associativity to be very small to encourage +# races between requests and writebacks. +# +options.l1d_size="256B" +options.l1i_size="256B" +options.l2_size="512B" +options.l3_size="1kB" +options.l1d_assoc=2 +options.l1i_assoc=2 +options.l2_assoc=2 +options.l3_assoc=2 +options.num_compute_units=8 +options.num_sqc=2 + +# Check to for the GPU_RfO protocol. Other GPU protocols are non-SC and will +# not work with the Ruby random tester. +assert(buildEnv['PROTOCOL'] == 'GPU_RfO') + +# +# create the tester and system, including ruby +# +tester = RubyTester(check_flush = False, checks_to_complete = 100, + wakeup_frequency = 10, num_cpus = options.num_cpus) + +# We set the testers as cpu for ruby to find the correct clock domains +# for the L1 Objects. 
+system = System(cpu = tester) + +# Dummy voltage domain for all our clock domains +system.voltage_domain = VoltageDomain(voltage = options.sys_voltage) +system.clk_domain = SrcClockDomain(clock = '1GHz', + voltage_domain = system.voltage_domain) + +system.mem_ranges = AddrRange('256MB') + +Ruby.create_system(options, False, system) + +# Create a separate clock domain for Ruby +system.ruby.clk_domain = SrcClockDomain(clock = '1GHz', + voltage_domain = system.voltage_domain) + +tester.num_cpus = len(system.ruby._cpu_ports) + +# +# The tester is most effective when randomization is turned on and +# artifical delay is randomly inserted on messages +# +system.ruby.randomization = True + +for ruby_port in system.ruby._cpu_ports: + # + # Tie the ruby tester ports to the ruby cpu read and write ports + # + if ruby_port.support_data_reqs and ruby_port.support_inst_reqs: + tester.cpuInstDataPort = ruby_port.slave + elif ruby_port.support_data_reqs: + tester.cpuDataPort = ruby_port.slave + elif ruby_port.support_inst_reqs: + tester.cpuInstPort = ruby_port.slave + + # Do not automatically retry stalled Ruby requests + ruby_port.no_retry_on_stall = True + + # + # Tell the sequencer this is the ruby tester so that it + # copies the subblock back to the checker + # + ruby_port.using_ruby_tester = True + +# ----------------------- +# run simulation +# ----------------------- + +root = Root(full_system = False, system = system ) +root.system.mem_mode = 'timing' + +# Not much point in this being higher than the L1 latency +m5.ticks.setGlobalFrequency('1ns') diff --git a/tests/configs/gpu-ruby.py b/tests/configs/gpu-ruby.py new file mode 100644 index 000000000..632b4dec0 --- /dev/null +++ b/tests/configs/gpu-ruby.py @@ -0,0 +1,353 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Brad Beckmann +# + +import m5 +from m5.objects import * +from m5.defines import buildEnv +from m5.util import addToPath +import os, optparse, sys, math, glob + +# Get paths we might need +config_path = os.path.dirname(os.path.abspath(__file__)) +config_root = os.path.dirname(config_path) +addToPath(config_root+'/configs/common') +addToPath(config_root+'/configs/ruby') +addToPath(config_root+'/configs/topologies') + +import Ruby +import Options +import GPUTLBOptions, GPUTLBConfig + +########################## Script Options ######################## +def setOption(parser, opt_str, value = 1): + # check to make sure the option actually exists + if not parser.has_option(opt_str): + raise Exception("cannot find %s in list of possible options" % opt_str) + + opt = parser.get_option(opt_str) + # set the value + exec("parser.values.%s = %s" % (opt.dest, value)) + +def getOption(parser, opt_str): + # check to make sure the option actually exists + if not parser.has_option(opt_str): + raise Exception("cannot find %s in list of possible options" % opt_str) + + opt = parser.get_option(opt_str) + # get the value + exec("return_value = parser.values.%s" % opt.dest) + return return_value + +def run_test(root): + """gpu test requires a specialized run_test implementation to set up the + mmio space.""" + + # instantiate configuration + m5.instantiate() + + # Now that the system has been constructed, setup the mmio space + root.system.cpu[0].workload[0].map(0x10000000, 0x200000000, 4096) + + # simulate until program terminates + exit_event = m5.simulate(maxtick) + print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause() + +parser = optparse.OptionParser() +Options.addCommonOptions(parser) +Options.addSEOptions(parser) + +parser.add_option("-k", "--kernel-files", + help="file(s) containing GPU kernel code (colon separated)") +parser.add_option("-u", "--num-compute-units", type="int", default=2, + help="number of GPU compute units"), 
+parser.add_option("--numCPs", type="int", default=0, + help="Number of GPU Command Processors (CP)") +parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \ + "per CU") +parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs" \ + "sharing an SQC (icache, and thus icache TLB)") +parser.add_option("--wf-size", type="int", default=64, + help="Wavefront size(in workitems)") +parser.add_option("--wfs-per-simd", type="int", default=8, help="Number of " \ + "WF slots per SIMD") +parser.add_option("--sp-bypass-path-length", type="int", default=4, \ + help="Number of stages of bypass path in vector ALU for Single "\ + "Precision ops") +parser.add_option("--dp-bypass-path-length", type="int", default=4, \ + help="Number of stages of bypass path in vector ALU for Double "\ + "Precision ops") +parser.add_option("--issue-period", type="int", default=4, \ + help="Number of cycles per vector instruction issue period") +parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \ + help="VGPR to Coalescer (Global Memory) data bus width in bytes") +parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \ + help="Coalescer to VGPR (Global Memory) data bus width in bytes") +parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \ + help="Number of Shared Memory pipelines per CU") +parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \ + help="Number of Global Memory pipelines per CU") +parser.add_option("--vreg-file-size", type="int", default=2048, + help="number of physical vector registers per SIMD") +parser.add_option("--bw-scalor", type="int", default=0, + help="bandwidth scalor for scalability analysis") +parser.add_option("--CPUClock", type="string", default="2GHz", + help="CPU clock") +parser.add_option("--GPUClock", type="string", default="1GHz", + help="GPU clock") +parser.add_option("--cpu-voltage", action="store", type="string", + default='1.0V', + help = """CPU voltage domain""") 
+parser.add_option("--gpu-voltage", action="store", type="string", + default='1.0V', + help = """CPU voltage domain""") +parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST", + help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)") +parser.add_option("--xact-cas-mode", action="store_true", + help="enable load_compare mode (transactional CAS)") +parser.add_option("--SegFaultDebug",action="store_true", + help="checks for GPU seg fault before TLB access") +parser.add_option("--LocalMemBarrier",action="store_true", + help="Barrier does not wait for writethroughs to complete") +parser.add_option("--countPages", action="store_true", + help="Count Page Accesses and output in per-CU output files") +parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"\ + "TLBs") +parser.add_option("--pf-type", type="string", help="type of prefetch: "\ + "PF_CU, PF_WF, PF_PHASE, PF_STRIDE") +parser.add_option("--pf-stride", type="int", help="set prefetch stride") +parser.add_option("--numLdsBanks", type="int", default=32, + help="number of physical banks per LDS module") +parser.add_option("--ldsBankConflictPenalty", type="int", default=1, + help="number of cycles per LDS bank conflict") + +# Add the ruby specific and protocol specific options +Ruby.define_options(parser) + +GPUTLBOptions.tlb_options(parser) + +(options, args) = parser.parse_args() + +# The GPU cache coherence protocols only work with the backing store +setOption(parser, "--access-backing-store") + +# Currently, the sqc (I-Cache of GPU) is shared by +# multiple compute units(CUs). The protocol works just fine +# even if sqc is not shared. 
Overriding this option here +# so that the user need not explicitly set this (assuming +# sharing sqc is the common usage) +n_cu = options.num_compute_units +num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc)) +options.num_sqc = num_sqc # pass this to Ruby + +########################## Creating the GPU system ######################## +# shader is the GPU +shader = Shader(n_wf = options.wfs_per_simd, + clk_domain = SrcClockDomain( + clock = options.GPUClock, + voltage_domain = VoltageDomain( + voltage = options.gpu_voltage)), + timing = True) + +# GPU_RfO(Read For Ownership) implements SC/TSO memory model. +# Other GPU protocols implement release consistency at GPU side. +# So, all GPU protocols other than GPU_RfO should make their writes +# visible to the global memory and should read from global memory +# during kernal boundary. The pipeline initiates(or do not initiate) +# the acquire/release operation depending on this impl_kern_boundary_sync +# flag. This flag=true means pipeline initiates a acquire/release operation +# at kernel boundary. 
+if buildEnv['PROTOCOL'] == 'GPU_RfO': + shader.impl_kern_boundary_sync = False +else: + shader.impl_kern_boundary_sync = True + +# Switching off per-lane TLB by default +per_lane = False +if options.TLB_config == "perLane": + per_lane = True + +# List of compute units; one GPU can have multiple compute units +compute_units = [] +for i in xrange(n_cu): + compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane, + num_SIMDs = options.simds_per_cu, + wfSize = options.wf_size, + spbypass_pipe_length = \ + options.sp_bypass_path_length, + dpbypass_pipe_length = \ + options.dp_bypass_path_length, + issue_period = options.issue_period, + coalescer_to_vrf_bus_width = \ + options.glbmem_rd_bus_width, + vrf_to_coalescer_bus_width = \ + options.glbmem_wr_bus_width, + num_global_mem_pipes = \ + options.glb_mem_pipes_per_cu, + num_shared_mem_pipes = \ + options.shr_mem_pipes_per_cu, + n_wf = options.wfs_per_simd, + execPolicy = options.CUExecPolicy, + xactCasMode = options.xact_cas_mode, + debugSegFault = options.SegFaultDebug, + functionalTLB = True, + localMemBarrier = options.LocalMemBarrier, + countPages = options.countPages, + localDataStore = \ + LdsState(banks = options.numLdsBanks, + bankConflictPenalty = \ + options.ldsBankConflictPenalty))) + wavefronts = [] + vrfs = [] + for j in xrange(options.simds_per_cu): + for k in xrange(shader.n_wf): + wavefronts.append(Wavefront(simdId = j, wf_slot_id = k)) + vrfs.append(VectorRegisterFile(simd_id=j, + num_regs_per_simd=options.vreg_file_size)) + compute_units[-1].wavefronts = wavefronts + compute_units[-1].vector_register_file = vrfs + if options.TLB_prefetch: + compute_units[-1].prefetch_depth = options.TLB_prefetch + compute_units[-1].prefetch_prev_type = options.pf_type + + # attach the LDS and the CU to the bus (actually a Bridge) + compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave + compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort + +# Attach compute units to GPU +shader.CUs = 
compute_units + +# this is a uniprocessor only test, thus the shader is the second index in the +# list of "system.cpus" +options.num_cpus = 1 +shader_idx = 1 +cpu = TimingSimpleCPU(cpu_id=0) + +########################## Creating the GPU dispatcher ######################## +# Dispatcher dispatches work from host CPU to GPU +host_cpu = cpu +dispatcher = GpuDispatcher() + +# Currently does not test for command processors +cpu_list = [cpu] + [shader] + [dispatcher] + +system = System(cpu = cpu_list, + mem_ranges = [AddrRange(options.mem_size)], + mem_mode = 'timing') + +# Dummy voltage domain for all our clock domains +system.voltage_domain = VoltageDomain(voltage = options.sys_voltage) +system.clk_domain = SrcClockDomain(clock = '1GHz', + voltage_domain = system.voltage_domain) + +# Create a seperate clock domain for components that should run at +# CPUs frequency +system.cpu[0].clk_domain = SrcClockDomain(clock = '2GHz', + voltage_domain = \ + system.voltage_domain) + +# configure the TLB hierarchy +GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) + +# create Ruby system +system.piobus = IOXBar(width=32, response_latency=0, + frontend_latency=0, forward_latency=0) +Ruby.create_system(options, None, system) + +# Create a separate clock for Ruby +system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock, + voltage_domain = system.voltage_domain) + +# create the interrupt controller +cpu.createInterruptController() + +# +# Tie the cpu cache ports to the ruby cpu ports and +# physmem, respectively +# +cpu.connectAllPorts(system.ruby._cpu_ports[0]) +system.ruby._cpu_ports[0].mem_master_port = system.piobus.slave + +# attach CU ports to Ruby +# Because of the peculiarities of the CP core, you may have 1 CPU but 2 +# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be +# hooked up until after the CP. 
To make this script generic, figure out +# the index as below, but note that this assumes there is one sequencer +# per compute unit and one sequencer per SQC for the math to work out +# correctly. +gpu_port_idx = len(system.ruby._cpu_ports) \ + - options.num_compute_units - options.num_sqc +gpu_port_idx = gpu_port_idx - options.numCPs * 2 + +wavefront_size = options.wf_size +for i in xrange(n_cu): + # The pipeline issues wavefront_size number of uncoalesced requests + # in one GPU issue cycle. Hence wavefront_size mem ports. + for j in xrange(wavefront_size): + system.cpu[shader_idx].CUs[i].memory_port[j] = \ + system.ruby._cpu_ports[gpu_port_idx].slave[j] + gpu_port_idx += 1 + +for i in xrange(n_cu): + if i > 0 and not i % options.cu_per_sqc: + gpu_port_idx += 1 + system.cpu[shader_idx].CUs[i].sqc_port = \ + system.ruby._cpu_ports[gpu_port_idx].slave +gpu_port_idx = gpu_port_idx + 1 + +assert(options.numCPs == 0) + +# connect dispatcher to the system.piobus +dispatcher.pio = system.piobus.master +dispatcher.dma = system.piobus.slave + +################# Connect the CPU and GPU via GPU Dispatcher ################### +# CPU rings the GPU doorbell to notify a pending task +# using this interface. +# And GPU uses this interface to notify the CPU of task completion +# The communcation happens through emulated driver. 
+ +# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array +# parameters must be after the explicit setting of the System cpu list +shader.cpu_pointer = host_cpu +dispatcher.cpu = host_cpu +dispatcher.shader_pointer = shader + +# ----------------------- +# run simulation +# ----------------------- + +root = Root(full_system = False, system = system) +m5.ticks.setGlobalFrequency('1THz') +root.system.mem_mode = 'timing' diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini new file mode 100644 index 000000000..5486af826 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini @@ -0,0 +1,4423 @@ +[root] +type=Root +children=system +eventq_index=0 +full_system=false +sim_quantum=0 +time_sync_enable=false +time_sync_period=100000000000 +time_sync_spin_threshold=100000000 + +[system] +type=System +children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tccdir_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain +boot_osflags=a +cache_line_size=64 +clk_domain=system.clk_domain +eventq_index=0 +exit_on_work_items=false +init_param=0 +kernel= +kernel_addr_check=true +load_addr_mask=1099511627775 +load_offset=0 +mem_mode=timing +mem_ranges=0:536870911 +memories=system.mem_ctrls system.ruby.phys_mem +mmap_using_noreserve=false +multi_thread=false +num_work_ids=16 +readfile= +symbolfile= +work_begin_ckpt_count=0 +work_begin_cpu_id_exit=-1 +work_begin_exit_count=0 +work_cpus_ckpt_count=0 +work_end_ckpt_count=0 +work_end_exit_count=0 +work_item_id=-1 +system_port=system.sys_port_proxy.slave[0] + +[system.clk_domain] +type=SrcClockDomain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + 
+[system.cp_cntrl0] +type=CorePair_Controller +children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore +L1D0cache=system.cp_cntrl0.L1D0cache +L1D1cache=system.cp_cntrl0.L1D1cache +L1Icache=system.cp_cntrl0.L1Icache +L2cache=system.cp_cntrl0.L2cache +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=15 +l2_hit_latency=18 +mandatoryQueue=system.cp_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToCore=system.cp_cntrl0.probeToCore +recycle_latency=10 +requestFromCore=system.cp_cntrl0.requestFromCore +responseFromCore=system.cp_cntrl0.responseFromCore +responseToCore=system.cp_cntrl0.responseToCore +ruby_system=system.ruby +send_evictions=true +sequencer=system.cp_cntrl0.sequencer +sequencer1=system.cp_cntrl0.sequencer1 +system=system +transitions_per_cycle=32 +triggerQueue=system.cp_cntrl0.triggerQueue +unblockFromCore=system.cp_cntrl0.unblockFromCore +version=0 + +[system.cp_cntrl0.L1D0cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1D0cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.cp_cntrl0.L1D1cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1D1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 
+size=65536 + +[system.cp_cntrl0.L1Icache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1Icache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=32768 + +[system.cp_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=2097152 + +[system.cp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.cp_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[3] + +[system.cp_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[2] + +[system.cp_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[3] + +[system.cp_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[4] + +[system.cp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=0 +dcache=system.cp_cntrl0.L1D0cache +dcache_hit_latency=2 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache 
+icache_hit_latency=2 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=0 +master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave +mem_master_port=system.piobus.slave[0] +slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master + +[system.cp_cntrl0.sequencer1] +type=RubySequencer +clk_domain=system.clk_domain +coreid=1 +dcache=system.cp_cntrl0.L1D1cache +dcache_hit_latency=2 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=2 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=1 + +[system.cp_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.cp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[4] + +[system.cpu0] +type=TimingSimpleCPU +children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload +branchPred=Null +checker=Null +clk_domain=system.cpu0.clk_domain +cpu_id=0 +do_checkpoint_insts=true +do_quiesce=true +do_statistics_insts=true +dtb=system.cpu0.dtb +eventq_index=0 +function_trace=false +function_trace_start=0 +interrupts=system.cpu0.interrupts +isa=system.cpu0.isa +itb=system.cpu0.itb +max_insts_all_threads=0 +max_insts_any_thread=0 +max_loads_all_threads=0 +max_loads_any_thread=0 +numThreads=1 +profile=0 +progress_interval=0 +simpoint_start_insts= +socket_id=0 +switched_out=false +system=system +tracer=system.cpu0.tracer +workload=system.cpu0.workload 
+dcache_port=system.cp_cntrl0.sequencer.slave[1] +icache_port=system.cp_cntrl0.sequencer.slave[0] + +[system.cpu0.apic_clk_domain] +type=DerivedClockDomain +clk_divider=16 +clk_domain=system.cpu0.clk_domain +eventq_index=0 + +[system.cpu0.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cpu0.dtb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.dtb.walker + +[system.cpu0.dtb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[3] + +[system.cpu0.interrupts] +type=X86LocalApic +clk_domain=system.cpu0.apic_clk_domain +eventq_index=0 +int_latency=1000 +pio_addr=2305843009213693952 +pio_latency=100000 +system=system +int_master=system.cp_cntrl0.sequencer.slave[4] +int_slave=system.cp_cntrl0.sequencer.master[1] +pio=system.cp_cntrl0.sequencer.master[0] + +[system.cpu0.isa] +type=X86ISA +eventq_index=0 + +[system.cpu0.itb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.itb.walker + +[system.cpu0.itb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[2] + +[system.cpu0.tracer] +type=ExeTracer +eventq_index=0 + +[system.cpu0.workload] +type=LiveProcess +cmd=gpu-hello +cwd= +drivers=system.cpu2.cl_driver +egid=100 +env= +errout=cerr +euid=100 +eventq_index=0 +executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello +gid=100 +input=cin +kvmInSE=false +max_stack_size=67108864 +output=cout +pid=100 +ppid=99 +simpoint=0 +system=system +uid=100 +useArchPT=false + +[system.cpu1] +type=Shader +children=CUs0 CUs1 clk_domain +CUs=system.cpu1.CUs0 system.cpu1.CUs1 +clk_domain=system.cpu1.clk_domain +cpu_pointer=system.cpu0 +eventq_index=0 +globalmem=65536 +impl_kern_boundary_sync=false +n_wf=8 
+separate_acquire_release=false +timing=true +translation=false + +[system.cpu1.CUs0] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=0 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs0.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 
system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs0.ldsBus.slave +memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] 
system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[0] +sqc_tlb_port=system.sqc_coalescer.slave[0] +translation_port=system.l1_coalescer0.slave[0] + +[system.cpu1.CUs0.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs0.localDataStore.cuPort +slave=system.cpu1.CUs0.ldsPort + +[system.cpu1.CUs0.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs0.ldsBus.master + +[system.cpu1.CUs0.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs0.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs0.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs0.vector_register_file3] +type=VectorRegisterFile 
+eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs0.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + 
+[system.cpu1.CUs0.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.CUs1] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=1 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs1.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 
+num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs1.ldsBus.slave +memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] 
system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] system.tcp_cntrl1.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[1] +sqc_tlb_port=system.sqc_coalescer.slave[1] 
+translation_port=system.l1_coalescer1.slave[0] + +[system.cpu1.CUs1.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs1.localDataStore.cuPort +slave=system.cpu1.CUs1.ldsPort + +[system.cpu1.CUs1.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs1.ldsBus.master + +[system.cpu1.CUs1.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs1.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs1.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs1.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs1.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts10] +type=Wavefront +eventq_index=0 
+simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.clk_domain] +type=SrcClockDomain 
+children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.cpu1.clk_domain.voltage_domain + +[system.cpu1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.cpu2] +type=GpuDispatcher +children=cl_driver +cl_driver=system.cpu2.cl_driver +clk_domain=system.clk_domain +cpu=system.cpu0 +eventq_index=0 +pio_addr=8589934592 +pio_latency=1000 +shader_pointer=system.cpu1 +system=system +dma=system.piobus.slave[1] +pio=system.piobus.master[0] +translation_port=system.dispatcher_coalescer.slave[0] + +[system.cpu2.cl_driver] +type=ClDriver +codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +eventq_index=0 +filename=hsa + +[system.dir_cntrl0] +type=Directory_Controller +children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores +CPUonly=false +L3CacheMemory=system.dir_cntrl0.L3CacheMemory +L3triggerQueue=system.dir_cntrl0.L3triggerQueue +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +directory=system.dir_cntrl0.directory +eventq_index=0 +l3_hit_latency=15 +noTCCdir=false +number_of_TBEs=5120 +probeToCore=system.dir_cntrl0.probeToCore +recycle_latency=10 +requestFromCores=system.dir_cntrl0.requestFromCores +responseFromCores=system.dir_cntrl0.responseFromCores +responseFromMemory=system.dir_cntrl0.responseFromMemory +responseToCore=system.dir_cntrl0.responseToCore +response_latency=30 +ruby_system=system.ruby +system=system +to_memory_controller_latency=1 +transitions_per_cycle=32 +triggerQueue=system.dir_cntrl0.triggerQueue +unblockFromCores=system.dir_cntrl0.unblockFromCores +useL3OnWT=false +version=0 +memory=system.mem_ctrls.port + +[system.dir_cntrl0.L3CacheMemory] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=20 +dataArrayBanks=256.0 +eventq_index=0 +is_icache=false 
+replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16777216 +start_index_bit=6 +tagAccessLatency=15 +tagArrayBanks=256.0 + +[system.dir_cntrl0.L3CacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16777216 + +[system.dir_cntrl0.L3triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.directory] +type=RubyDirectoryMemory +eventq_index=0 +numa_high_bit=5 +size=536870912 +version=0 + +[system.dir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[0] + +[system.dir_cntrl0.requestFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[0] + +[system.dir_cntrl0.responseFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[1] + +[system.dir_cntrl0.responseFromMemory] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.dir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[1] + +[system.dir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.unblockFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[2] + +[system.dispatcher_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.dispatcher_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.dispatcher_tlb.slave[0] +slave=system.cpu2.translation_port + +[system.dispatcher_coalescer.clk_domain] +type=SrcClockDomain 
+children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain + +[system.dispatcher_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dispatcher_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.dispatcher_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[1] +slave=system.dispatcher_coalescer.master[0] + +[system.dispatcher_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain + +[system.dispatcher_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dvfs_handler] +type=DVFSHandler +domains= +enable=false +eventq_index=0 +sys_clk_domain=system.clk_domain +transition_latency=100000000 + +[system.l1_coalescer0] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer0.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb0.slave[0] +slave=system.cpu1.CUs0.translation_port[0] + +[system.l1_coalescer0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain + +[system.l1_coalescer0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_coalescer1] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer1.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb1.slave[0] +slave=system.cpu1.CUs1.translation_port[0] + +[system.l1_coalescer1.clk_domain] +type=SrcClockDomain +children=voltage_domain 
+clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain + +[system.l1_coalescer1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb0] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb0.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[2] +slave=system.l1_coalescer0.master[0] + +[system.l1_tlb0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb0.clk_domain.voltage_domain + +[system.l1_tlb0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb1] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb1.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[3] +slave=system.l1_coalescer1.master[0] + +[system.l1_tlb1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb1.clk_domain.voltage_domain + +[system.l1_tlb1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l2_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l2_tlb.slave[0] +slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0] + +[system.l2_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_coalescer.clk_domain.voltage_domain 
+ +[system.l2_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l2_tlb.clk_domain +eventq_index=0 +hitLatency=69 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=4096 +master=system.l3_coalescer.slave[0] +slave=system.l2_coalescer.master[0] + +[system.l2_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_tlb.clk_domain.voltage_domain + +[system.l2_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l3_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l3_tlb.slave[0] +slave=system.l2_tlb.master[0] + +[system.l3_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_coalescer.clk_domain.voltage_domain + +[system.l3_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l3_tlb.clk_domain +eventq_index=0 +hitLatency=150 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=8192 +slave=system.l3_coalescer.master[0] + +[system.l3_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_tlb.clk_domain.voltage_domain + +[system.l3_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.mem_ctrls] +type=DRAMCtrl +IDD0=0.075000 +IDD02=0.000000 +IDD2N=0.050000 +IDD2N2=0.000000 +IDD2P0=0.000000 +IDD2P02=0.000000 
+IDD2P1=0.000000 +IDD2P12=0.000000 +IDD3N=0.057000 +IDD3N2=0.000000 +IDD3P0=0.000000 +IDD3P02=0.000000 +IDD3P1=0.000000 +IDD3P12=0.000000 +IDD4R=0.187000 +IDD4R2=0.000000 +IDD4W=0.165000 +IDD4W2=0.000000 +IDD5=0.220000 +IDD52=0.000000 +IDD6=0.000000 +IDD62=0.000000 +VDD=1.500000 +VDD2=0.000000 +activation_limit=4 +addr_mapping=RoRaBaCoCh +bank_groups_per_rank=0 +banks_per_rank=8 +burst_length=8 +channels=1 +clk_domain=system.clk_domain +conf_table_reported=true +device_bus_width=8 +device_rowbuffer_size=1024 +device_size=536870912 +devices_per_rank=8 +dll=true +eventq_index=0 +in_addr_map=true +max_accesses_per_row=16 +mem_sched_policy=frfcfs +min_writes_per_switch=16 +null=false +page_policy=open_adaptive +range=0:536870911 +ranks_per_channel=2 +read_buffer_size=32 +static_backend_latency=10000 +static_frontend_latency=10000 +tBURST=5000 +tCCD_L=0 +tCK=1250 +tCL=13750 +tCS=2500 +tRAS=35000 +tRCD=13750 +tREFI=7800000 +tRFC=260000 +tRP=13750 +tRRD=6000 +tRRD_L=0 +tRTP=7500 +tRTW=2500 +tWR=15000 +tWTR=7500 +tXAW=30000 +tXP=0 +tXPDLL=0 +tXS=0 +tXSDLL=0 +write_buffer_size=64 +write_high_thresh_perc=85 +write_low_thresh_perc=50 +port=system.dir_cntrl0.memory + +[system.piobus] +type=NoncoherentXBar +clk_domain=system.clk_domain +eventq_index=0 +forward_latency=0 +frontend_latency=0 +response_latency=0 +use_default_range=false +width=32 +master=system.cpu2.pio +slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma + +[system.ruby] +type=RubySystem +children=clk_domain network phys_mem +access_backing_store=true +all_instructions=false +block_size_bytes=64 +clk_domain=system.ruby.clk_domain +eventq_index=0 +hot_lines=false +memory_size_bits=48 +num_of_sequencers=5 +number_of_virtual_networks=10 +phys_mem=system.ruby.phys_mem +randomization=false + +[system.ruby.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.ruby.network] +type=SimpleNetwork +children=ext_links0 
ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 ext_links6 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1 +adaptive_routing=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +control_msg_size=8 +endpoint_bandwidth=1000 +eventq_index=0 +ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 system.ruby.network.ext_links6 +int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 
system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39 +int_links=system.ruby.network.int_links0 system.ruby.network.int_links1 +netifs= +number_of_virtual_networks=10 +routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node +ruby_system=system.ruby +topology=Crossbar +master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.responseToTCC.slave system.tccdir_cntrl0.requestFromTCP.slave system.tccdir_cntrl0.responseFromTCP.slave system.tccdir_cntrl0.unblockFromTCP.slave system.tccdir_cntrl0.probeFromNB.slave system.tccdir_cntrl0.responseFromNB.slave +slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master 
system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.sqc_cntrl0.responseFromSQC.master system.sqc_cntrl0.unblockFromCore.master system.tcc_cntrl0.responseFromTCC.master system.tccdir_cntrl0.probeToCore.master system.tccdir_cntrl0.responseToCore.master system.tccdir_cntrl0.requestToNB.master system.tccdir_cntrl0.responseToNB.master system.tccdir_cntrl0.unblockToNB.master + +[system.ruby.network.ext_links0] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 +ext_node=system.dir_cntrl0 +int_node=system.ruby.network.ext_links0.int_node +latency=1 +link_id=0 +weight=1 + +[system.ruby.network.ext_links0.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 
port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 port_buffers86 port_buffers87 port_buffers88 port_buffers89 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 
system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 system.ruby.network.ext_links0.int_node.port_buffers35 system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 
system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 system.ruby.network.ext_links0.int_node.port_buffers71 system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79 system.ruby.network.ext_links0.int_node.port_buffers80 system.ruby.network.ext_links0.int_node.port_buffers81 system.ruby.network.ext_links0.int_node.port_buffers82 system.ruby.network.ext_links0.int_node.port_buffers83 system.ruby.network.ext_links0.int_node.port_buffers84 system.ruby.network.ext_links0.int_node.port_buffers85 system.ruby.network.ext_links0.int_node.port_buffers86 system.ruby.network.ext_links0.int_node.port_buffers87 system.ruby.network.ext_links0.int_node.port_buffers88 system.ruby.network.ext_links0.int_node.port_buffers89 +router_id=0 +virt_nets=10 + +[system.ruby.network.ext_links0.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers04] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers80] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers81] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers82] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers83] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers84] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers85] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers86] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers87] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers88] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers89] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 +ext_node=system.cp_cntrl0 +int_node=system.ruby.network.ext_links1.int_node +latency=1 +link_id=1 +weight=1 + +[system.ruby.network.ext_links1.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 
port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 
system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 
system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69 system.ruby.network.ext_links1.int_node.port_buffers70 system.ruby.network.ext_links1.int_node.port_buffers71 system.ruby.network.ext_links1.int_node.port_buffers72 system.ruby.network.ext_links1.int_node.port_buffers73 system.ruby.network.ext_links1.int_node.port_buffers74 system.ruby.network.ext_links1.int_node.port_buffers75 system.ruby.network.ext_links1.int_node.port_buffers76 system.ruby.network.ext_links1.int_node.port_buffers77 system.ruby.network.ext_links1.int_node.port_buffers78 system.ruby.network.ext_links1.int_node.port_buffers79 +router_id=1 +virt_nets=10 + +[system.ruby.network.ext_links1.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers22] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=2 +weight=1 + +[system.ruby.network.ext_links2.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 
+clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 
system.ruby.network.ext_links2.int_node.port_buffers35 system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 system.ruby.network.ext_links2.int_node.port_buffers70 
system.ruby.network.ext_links2.int_node.port_buffers71 system.ruby.network.ext_links2.int_node.port_buffers72 system.ruby.network.ext_links2.int_node.port_buffers73 system.ruby.network.ext_links2.int_node.port_buffers74 system.ruby.network.ext_links2.int_node.port_buffers75 system.ruby.network.ext_links2.int_node.port_buffers76 system.ruby.network.ext_links2.int_node.port_buffers77 system.ruby.network.ext_links2.int_node.port_buffers78 system.ruby.network.ext_links2.int_node.port_buffers79 +router_id=2 +virt_nets=10 + +[system.ruby.network.ext_links2.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links3] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl1 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=3 +weight=1 + +[system.ruby.network.ext_links4] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.sqc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=4 +weight=1 + +[system.ruby.network.ext_links5] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=5 +weight=1 + +[system.ruby.network.ext_links6] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tccdir_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=6 +weight=1 + +[system.ruby.network.int_link_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers06] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers21] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.int_link_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_links0] +type=SimpleIntLink +bandwidth_factor=512 +eventq_index=0 +latency=1 +link_id=0 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links1.int_node +weight=1 + +[system.ruby.network.int_links1] +type=SimpleIntLink +bandwidth_factor=512 +eventq_index=0 +latency=1 +link_id=1 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links2.int_node +weight=1 + +[system.ruby.phys_mem] +type=SimpleMemory +bandwidth=73.000000 +clk_domain=system.ruby.clk_domain +conf_table_reported=true +eventq_index=0 +in_addr_map=false +latency=30000 +latency_var=0 +null=false +range=0:536870911 + +[system.sqc_cntrl0] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore +L1cache=system.sqc_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 +mandatoryQueue=system.sqc_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.sqc_cntrl0.probeToSQC +recycle_latency=10 +requestFromSQC=system.sqc_cntrl0.requestFromSQC +responseFromSQC=system.sqc_cntrl0.responseFromSQC +responseToSQC=system.sqc_cntrl0.responseToSQC +ruby_system=system.ruby +sequencer=system.sqc_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.sqc_cntrl0.unblockFromCore +version=0 + +[system.sqc_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 
+eventq_index=0 +is_icache=false +replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=4 + +[system.sqc_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.sqc_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.sqc_cntrl0.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[9] + +[system.sqc_cntrl0.requestFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[11] + +[system.sqc_cntrl0.responseFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[12] + +[system.sqc_cntrl0.responseToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[10] + +[system.sqc_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.sqc_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.sqc_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=6 +slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port + +[system.sqc_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[13] + +[system.sqc_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.sqc_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 
+probesPerCycle=2 +master=system.sqc_tlb.slave[0] +slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port + +[system.sqc_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain + +[system.sqc_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sqc_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.sqc_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[0] +slave=system.sqc_coalescer.master[0] + +[system.sqc_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_tlb.clk_domain.voltage_domain + +[system.sqc_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sys_port_proxy] +type=RubyPortProxy +clk_domain=system.clk_domain +eventq_index=0 +is_cpu_sequencer=true +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_ruby_tester=false +version=0 +slave=system.system_port + +[system.tcc_cntrl0] +type=TCC_Controller +children=L2cache responseFromTCC responseToTCC w_TCCUnblockToTCCDir w_probeToTCC w_reqToTCC w_reqToTCCDir w_respToTCC w_respToTCCDir +L2cache=system.tcc_cntrl0.L2cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +l2_request_latency=1 +l2_response_latency=16 +number_of_TBEs=2048 +recycle_latency=10 +responseFromTCC=system.tcc_cntrl0.responseFromTCC +responseToTCC=system.tcc_cntrl0.responseToTCC +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +version=0 +w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir 
+w_probeToTCC=system.tcc_cntrl0.w_probeToTCC +w_reqToTCC=system.tcc_cntrl0.w_reqToTCC +w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir +w_respToTCC=system.tcc_cntrl0.w_respToTCC +w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir + +[system.tcc_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=8 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=262144.0 +start_index_bit=6 +tagAccessLatency=2 +tagArrayBanks=256 + +[system.tcc_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=262144.0 + +[system.tcc_cntrl0.responseFromTCC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[14] + +[system.tcc_cntrl0.responseToTCC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[11] + +[system.tcc_cntrl0.w_TCCUnblockToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_probeToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_reqToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_reqToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_respToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_respToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tccdir_cntrl0] +type=TCCdir_Controller +children=directory probeFromNB probeToCore requestFromTCP requestToNB responseFromNB responseFromTCP responseToCore responseToNB triggerQueue unblockFromTCP unblockToNB +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +directory=system.tccdir_cntrl0.directory 
+directory_latency=6 +eventq_index=0 +issue_latency=120 +number_of_TBEs=1024 +probeFromNB=system.tccdir_cntrl0.probeFromNB +probeToCore=system.tccdir_cntrl0.probeToCore +recycle_latency=10 +requestFromTCP=system.tccdir_cntrl0.requestFromTCP +requestToNB=system.tccdir_cntrl0.requestToNB +responseFromNB=system.tccdir_cntrl0.responseFromNB +responseFromTCP=system.tccdir_cntrl0.responseFromTCP +responseToCore=system.tccdir_cntrl0.responseToCore +responseToNB=system.tccdir_cntrl0.responseToNB +response_latency=5 +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +triggerQueue=system.tccdir_cntrl0.triggerQueue +unblockFromTCP=system.tccdir_cntrl0.unblockFromTCP +unblockToNB=system.tccdir_cntrl0.unblockToNB +version=0 +w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir +w_probeToTCC=system.tcc_cntrl0.w_probeToTCC +w_reqToTCC=system.tcc_cntrl0.w_reqToTCC +w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir +w_respToTCC=system.tcc_cntrl0.w_respToTCC +w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir + +[system.tccdir_cntrl0.directory] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.tccdir_cntrl0.directory.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=393216 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.tccdir_cntrl0.directory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=393216 + +[system.tccdir_cntrl0.probeFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[15] + +[system.tccdir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[15] + +[system.tccdir_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false 
+slave=system.ruby.network.master[12] + +[system.tccdir_cntrl0.requestToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[17] + +[system.tccdir_cntrl0.responseFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[16] + +[system.tccdir_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[13] + +[system.tccdir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[16] + +[system.tccdir_cntrl0.responseToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[18] + +[system.tccdir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.tccdir_cntrl0.unblockFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[14] + +[system.tccdir_cntrl0.unblockToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[19] + +[system.tcp_cntrl0] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl0.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl0.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl0.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl0.requestFromTCP +responseFromTCP=system.tcp_cntrl0.responseFromTCP +responseToTCP=system.tcp_cntrl0.responseToTCP +ruby_system=system.ruby 
+sequencer=system.tcp_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl0.unblockFromCore +use_seq_not_coal=false +version=0 + +[system.tcp_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl0.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2048 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=2 +slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] 
system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63] + +[system.tcp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl0.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[5] + +[system.tcp_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[5] + +[system.tcp_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[6] + +[system.tcp_cntrl0.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false +slave=system.ruby.network.master[6] + +[system.tcp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=3 + +[system.tcp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[7] + +[system.tcp_cntrl1] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl1.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl1.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl1.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl1.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl1.requestFromTCP +responseFromTCP=system.tcp_cntrl1.responseFromTCP +responseToTCP=system.tcp_cntrl1.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl1.unblockFromCore +use_seq_not_coal=false +version=1 + +[system.tcp_cntrl1.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 
+block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl1.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2048 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=4 +slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] 
system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63] + +[system.tcp_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl1.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[7] + +[system.tcp_cntrl1.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[8] + +[system.tcp_cntrl1.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[9] + +[system.tcp_cntrl1.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[8] + +[system.tcp_cntrl1.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=5 + +[system.tcp_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 
+ordered=true +randomization=false +master=system.ruby.network.slave[10] + +[system.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr new file mode 100755 index 000000000..1e2b8911e --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr @@ -0,0 +1,5 @@ +warn: system.ruby.network adopting orphan SimObject param 'int_links' +warn: system.ruby.network adopting orphan SimObject param 'ext_links' +warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes) +warn: Sockets disabled, not accepting gdb connections +warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files! diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout new file mode 100755 index 000000000..98757d4d3 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout @@ -0,0 +1,21 @@ +gem5 Simulator System. http://gem5.org +gem5 is copyrighted software; use the --copyright option for details. + +gem5 compiled Jan 19 2016 13:28:55 +gem5 started Jan 19 2016 13:29:16 +gem5 executing on zizzer, pid 48854 +command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_RfO -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_RfO + +Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +Global frequency set at 1000000000000 ticks per second +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) 
+Forcing maxCoalescedReqs to 32 (TLB assoc.) +info: Entering event queue @ 0. Starting simulation... +keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23 +the gpu says: +elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe +Exiting @ tick 663454500 because target called exit() diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt new file mode 100644 index 000000000..ac9e12c7a --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt @@ -0,0 +1,3202 @@ + +---------- Begin Simulation Statistics ---------- +sim_seconds 0.000663 # Number of seconds simulated +sim_ticks 663454500 # Number of ticks simulated +final_tick 663454500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset) +sim_freq 1000000000000 # Frequency of simulated ticks +host_inst_rate 63999 # Simulator instruction rate (inst/s) +host_op_rate 131608 # Simulator op (including micro ops) rate (op/s) +host_tick_rate 634065338 # Simulator tick rate (ticks/s) +host_mem_usage 1301448 # Number of bytes of host memory used +host_seconds 1.05 # Real time elapsed on the host +sim_insts 66963 # Number of instructions simulated +sim_ops 137705 # Number of ops (including micro ops) simulated +system.voltage_domain.voltage 1 # Voltage in Volts +system.clk_domain.clock 1000 # Clock period in ticks +system.mem_ctrls.bytes_read::dir_cntrl0 99264 # Number of bytes read from this memory +system.mem_ctrls.bytes_read::total 99264 # Number of bytes read from this memory +system.mem_ctrls.num_reads::dir_cntrl0 1551 # Number of read 
requests responded to by this memory +system.mem_ctrls.num_reads::total 1551 # Number of read requests responded to by this memory +system.mem_ctrls.bw_read::dir_cntrl0 149616892 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_read::total 149616892 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_total::dir_cntrl0 149616892 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.bw_total::total 149616892 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.readReqs 1551 # Number of read requests accepted +system.mem_ctrls.writeReqs 0 # Number of write requests accepted +system.mem_ctrls.readBursts 1551 # Number of DRAM read bursts, including those serviced by the write queue +system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue +system.mem_ctrls.bytesReadDRAM 99264 # Total number of bytes read from DRAM +system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue +system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM +system.mem_ctrls.bytesReadSys 99264 # Total read bytes from the system interface side +system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side +system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue +system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one +system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write +system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts 
+system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::10 174 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::12 222 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts +system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry +system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry +system.mem_ctrls.totGap 663221000 # Total gap between requests +system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::2 0 # Read 
request sizes (log2) +system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::6 1551 # Read request sizes (log2) +system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2) +system.mem_ctrls.rdQLenPdf::0 1542 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::1 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::2 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::3 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::4 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::5 3 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see 
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an 
incoming req see +system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see 
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::53 0 # What 
write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see +system.mem_ctrls.bytesPerActivate::samples 485 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::mean 204.008247 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::gmean 145.772769 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::stdev 192.306659 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::0-127 178 36.70% 36.70% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::128-255 156 32.16% 68.87% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::256-383 70 14.43% 83.30% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::384-511 40 8.25% 91.55% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::512-639 15 3.09% 94.64% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::640-767 10 2.06% 96.70% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::768-895 9 1.86% 98.56% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::896-1023 2 0.41% 98.97% # Bytes accessed per 
row activation +system.mem_ctrls.bytesPerActivate::1024-1151 5 1.03% 100.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::total 485 # Bytes accessed per row activation +system.mem_ctrls.totQLat 15500500 # Total ticks spent queuing +system.mem_ctrls.totMemAccLat 44581750 # Total ticks spent from burst creation until serviced by the DRAM +system.mem_ctrls.totBusLat 7755000 # Total ticks spent in databus transfers +system.mem_ctrls.avgQLat 9993.87 # Average queueing delay per DRAM burst +system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst +system.mem_ctrls.avgMemAccLat 28743.87 # Average memory access latency per DRAM burst +system.mem_ctrls.avgRdBW 149.62 # Average DRAM read bandwidth in MiByte/s +system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s +system.mem_ctrls.avgRdBWSys 149.62 # Average system read bandwidth in MiByte/s +system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s +system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s +system.mem_ctrls.busUtil 1.17 # Data bus utilization in percentage +system.mem_ctrls.busUtilRead 1.17 # Data bus utilization in percentage for reads +system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes +system.mem_ctrls.avgRdQLen 1.00 # Average read queue length when enqueuing +system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing +system.mem_ctrls.readRowHits 1062 # Number of row buffer hits during reads +system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes +system.mem_ctrls.readRowHitRate 68.47 # Row buffer hit rate for reads +system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes +system.mem_ctrls.avgGap 427608.64 # Average gap between requests +system.mem_ctrls.pageHitRate 68.47 # Row buffer hit rate, read and write combined +system.mem_ctrls_0.actEnergy 1391040 # Energy for activate commands per rank (pJ) 
+system.mem_ctrls_0.preEnergy 759000 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ) +system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_0.refreshEnergy 43227600 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_0.actBackEnergy 335485755 # Energy for active background per rank (pJ) +system.mem_ctrls_0.preBackEnergy 102969000 # Energy for precharge background per rank (pJ) +system.mem_ctrls_0.totalEnergy 489167595 # Total energy per rank (pJ) +system.mem_ctrls_0.averagePower 738.822020 # Core power per rank (mW) +system.mem_ctrls_0.memoryStateTime::IDLE 170399250 # Time in different power states +system.mem_ctrls_0.memoryStateTime::REF 22100000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT 470741750 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states +system.mem_ctrls_1.actEnergy 2275560 # Energy for activate commands per rank (pJ) +system.mem_ctrls_1.preEnergy 1241625 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_1.readEnergy 6723600 # Energy for read commands per rank (pJ) +system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_1.refreshEnergy 43227600 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_1.actBackEnergy 371983995 # Energy for active background per rank (pJ) +system.mem_ctrls_1.preBackEnergy 70953000 # Energy for precharge background per rank (pJ) +system.mem_ctrls_1.totalEnergy 496405380 # Total energy per rank (pJ) +system.mem_ctrls_1.averagePower 749.753724 # Core power per rank (mW) +system.mem_ctrls_1.memoryStateTime::IDLE 115859750 # Time in different power states +system.mem_ctrls_1.memoryStateTime::REF 22100000 # Time in different power states 
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT 524145250 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states +system.ruby.clk_domain.clock 500 # Clock period in ticks +system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory +system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # 
Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory +system.ruby.phys_mem.bw_read::cpu0.inst 1050200127 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu0.data 180618264 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 4943821 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 4943821 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::total 1240706032 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu0.inst 1050200127 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 3014525 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 3014525 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::total 1056229176 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu0.data 109678961 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 385859 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 385859 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::total 110450679 # Write bandwidth from this memory (bytes/s) 
+system.ruby.phys_mem.bw_total::cpu0.inst 1050200127 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.data 290297225 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 5329680 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 5329680 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::total 1351156711 # Total bandwidth to/from this memory (bytes/s) +system.ruby.outstanding_req_hist::bucket_size 1 +system.ruby.outstanding_req_hist::max_bucket 9 +system.ruby.outstanding_req_hist::samples 114203 +system.ruby.outstanding_req_hist::mean 1.000035 +system.ruby.outstanding_req_hist::gmean 1.000024 +system.ruby.outstanding_req_hist::stdev 0.005918 +system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.outstanding_req_hist::total 114203 +system.ruby.latency_hist::bucket_size 64 +system.ruby.latency_hist::max_bucket 639 +system.ruby.latency_hist::samples 114203 +system.ruby.latency_hist::mean 4.784183 +system.ruby.latency_hist::gmean 2.131364 +system.ruby.latency_hist::stdev 23.846744 +system.ruby.latency_hist | 112668 98.66% 98.66% | 0 0.00% 98.66% | 0 0.00% 98.66% | 1506 1.32% 99.97% | 19 0.02% 99.99% | 10 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.latency_hist::total 114203 +system.ruby.hit_latency_hist::bucket_size 64 +system.ruby.hit_latency_hist::max_bucket 639 +system.ruby.hit_latency_hist::samples 1535 +system.ruby.hit_latency_hist::mean 208.449511 +system.ruby.hit_latency_hist::gmean 208.002927 +system.ruby.hit_latency_hist::stdev 15.847049 +system.ruby.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1506 98.11% 98.11% | 19 1.24% 99.35% | 10 0.65% 100.00% 
| 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.hit_latency_hist::total 1535 +system.ruby.miss_latency_hist::bucket_size 4 +system.ruby.miss_latency_hist::max_bucket 39 +system.ruby.miss_latency_hist::samples 112668 +system.ruby.miss_latency_hist::mean 2.009426 +system.ruby.miss_latency_hist::gmean 2.002413 +system.ruby.miss_latency_hist::stdev 0.411800 +system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.miss_latency_hist::total 112668 +system.ruby.L1Cache.incomplete_times 112609 +system.ruby.L2Cache.incomplete_times 59 +system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses +system.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses +system.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads +system.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes +system.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads +system.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes +system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses +system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses +system.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads +system.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes +system.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag 
array reads +system.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes +system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses +system.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses +system.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads +system.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes +system.cp_cntrl0.L2cache.num_tag_array_reads 12059 # number of tag array reads +system.cp_cntrl0.L2cache.num_tag_array_writes 1649 # number of tag array writes +system.cpu0.clk_domain.clock 500 # Clock period in ticks +system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks +system.cpu0.workload.num_syscalls 21 # Number of system calls +system.cpu0.numCycles 1326909 # number of cpu cycles simulated +system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started +system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed +system.cpu0.committedInsts 66963 # Number of instructions committed +system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed +system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses +system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses +system.cpu0.num_func_calls 3196 # number of times a function call or return occured +system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls +system.cpu0.num_int_insts 136380 # number of integer instructions +system.cpu0.num_fp_insts 1279 # number of float instructions +system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read +system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written +system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read +system.cpu0.num_fp_register_writes 981 # number of times 
the floating registers were written +system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read +system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written +system.cpu0.num_mem_refs 27198 # number of memory refs +system.cpu0.num_load_insts 16684 # Number of load instructions +system.cpu0.num_store_insts 10514 # Number of store instructions +system.cpu0.num_idle_cycles 5227.003992 # Number of idle cycles +system.cpu0.num_busy_cycles 1321681.996008 # Number of busy cycles +system.cpu0.not_idle_fraction 0.996061 # Percentage of non-idle cycles +system.cpu0.idle_fraction 0.003939 # Percentage of idle cycles +system.cpu0.Branches 16199 # Number of branches fetched +system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction +system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction +system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction +system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction +system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction 
+system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction +system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction +system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::total 137705 # Class of executed instruction +system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.cpu1.clk_domain.clock 1000 # Clock period in ticks +system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies 
+system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 297 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed 
instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 273 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 
34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 272 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N 
source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 256 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 3230 # number of cycles the CU issues nothing +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 128 # number of cycles the CU issued at least one instruction +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued 
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 780 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 367 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 384 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 327 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 414 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 30 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.spc::samples 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::mean 0.041989 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::stdev 0.220406 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::0 3230 96.19% 96.19% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::1 116 3.45% 99.64% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::2 11 0.33% 99.97% # Execution units active per cycle (Exec unit=SIMD,MemPipe) 
+system.cpu1.CUs0.ExecStage.spc::3 1 0.03% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::total 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 82 # number of CU transitions from active to idle +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 82 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 39.280488 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 158.161058 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 62 75.61% 75.61% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 9 10.98% 86.59% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.22% 87.80% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 0 0.00% 87.80% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.44% 90.24% # duration of idle periods in cycles 
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.22% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 91.46% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 8.54% 100.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1285 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 82 # duration of idle periods in cycles +system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests 
+system.cpu1.CUs0.tlb_cycles -452460956000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs0.avg_translation_latency -588375755.526658 # Avg. translation latency for data translations +system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses +system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::mean 86.382979 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::stdev 229.391669 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::4-5 51 36.17% 45.39% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::6-7 32 22.70% 68.09% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::8-9 2 1.42% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::10 2 1.42% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::overflows 41 29.08% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::max_value 1291 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs0.num_total_cycles 3358 # number of cycles the CU ran for +system.cpu1.CUs0.vpc 2.015783 # Vector Operations per cycle (this CU only) +system.cpu1.CUs0.ipc 0.041989 # Instructions per cycle (this CU only) +system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts +system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 381 # number of times the wf's 
instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # 
number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed 
instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 356 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N 
source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port 
availability +system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 356 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 339 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions 
you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 3228 # number of cycles the CU issues nothing +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 130 # number of cycles the CU issued at least one instruction +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at 
least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 778 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 472 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 447 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 411 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 417 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.spc::samples 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::mean 0.041989 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::stdev 0.217686 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::0 3228 96.13% 96.13% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::1 120 3.57% 99.70% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::2 9 0.27% 99.97% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::3 1 0.03% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::5 
0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::total 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 81 # number of CU transitions from active to idle +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 81 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 38.617284 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 158.076213 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 60 74.07% 74.07% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 10 12.35% 86.42% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 86.42% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.47% 88.89% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 2 2.47% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 
0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 91.36% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 8.64% 100.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1293 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 81 # duration of idle periods in cycles +system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs1.tlb_cycles -452466433000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs1.avg_translation_latency -588382877.763329 # Avg. 
translation latency for data translations +system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses +system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number 
of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory 
packet +system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::mean 85.666667 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::stdev 230.212531 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::4-5 52 36.88% 46.10% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::6-7 33 23.40% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::8-9 4 2.84% 72.34% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 73.05% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::overflows 38 26.95% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::max_value 1299 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs1.num_total_cycles 3358 # number of cycles the CU ran for +system.cpu1.CUs1.vpc 2.013699 # Vector Operations per cycle (this CU only) +system.cpu1.CUs1.ipc 0.041989 # Instructions per cycle (this CU only) +system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts +system.cpu2.num_kernel_launched 1 # number of kernel launched +system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits +system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses +system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses +system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1551 # number 
of data array writes +system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1551 # number of tag array reads +system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1551 # number of tag array writes +system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses +system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses +system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue +system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts +system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level +system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched +system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs +system.dispatcher_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses +system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts +system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses +system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer1.local_latency 0 # Avg. 
latency over all incoming pkts +system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb0.unique_pages 4 # Number of unique pages touched +system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb0.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb1.unique_pages 3 # Number of unique pages touched +system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses +system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l2_coalescer.local_latency 125 # Avg. 
latency over all incoming pkts +system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses +system.l2_tlb.local_TLB_hits 3 # Number of TLB hits +system.l2_tlb.local_TLB_misses 5 # Number of TLB misses +system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate +system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses +system.l2_tlb.global_TLB_hits 3 # Number of TLB hits +system.l2_tlb.global_TLB_misses 12 # Number of TLB misses +system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate +system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level +system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.l2_tlb.unique_pages 5 # Number of unique pages touched +system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs +system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs +system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses +system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l3_coalescer.local_latency 200 # Avg. 
latency over all incoming pkts +system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses +system.l3_tlb.local_TLB_hits 0 # Number of TLB hits +system.l3_tlb.local_TLB_misses 5 # Number of TLB misses +system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses +system.l3_tlb.global_TLB_hits 0 # Number of TLB hits +system.l3_tlb.global_TLB_misses 12 # Number of TLB misses +system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level +system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table +system.l3_tlb.unique_pages 5 # Number of unique pages touched +system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs +system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs +system.l3_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.piobus.trans_dist::WriteReq 94 # Transaction distribution +system.piobus.trans_dist::WriteResp 94 # Transaction distribution +system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks) +system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%) +system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks) +system.piobus.respLayer0.utilization 0.0 # Layer utilization (%) +system.ruby.network.ext_links0.int_node.percent_links_utilized 0.007952 +system.ruby.network.ext_links0.int_node.msg_count.Control::0 1551 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1551 +system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1563 +system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 1539 +system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1551 +system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 12408 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12408 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 112536 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 12312 +system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12408 +system.ruby.network.ext_links1.int_node.percent_links_utilized 0.009970 +system.ruby.network.ext_links1.int_node.msg_count.Control::0 16 +system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535 
+system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537 +system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 14 +system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1535 +system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 128 +system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 112 +system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12280 +system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl0.L1cache.num_data_array_reads 10 # number of data array reads +system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl0.L1cache.num_tag_array_reads 27 # number of tag array reads +system.tcp_cntrl0.L1cache.num_tag_array_writes 18 # number of tag array writes +system.tcp_cntrl0.L1cache.num_tag_array_stalls 2 # number of stalls caused by tag array +system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 3 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_ld_misses 2 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 4 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 1 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_st_misses 4 # stores that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the 
TCP +system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.network.ext_links2.int_node.percent_links_utilized 0.000721 +system.ruby.network.ext_links2.int_node.msg_count.Control::0 1535 +system.ruby.network.ext_links2.int_node.msg_count.Control::1 14 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 16 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 19 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 26 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 33 +system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1525 +system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 16 +system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::5 19 +system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 12280 +system.ruby.network.ext_links2.int_node.msg_bytes.Control::1 112 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 152 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 1872 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 2376 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 12200 +system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::5 152 +system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits 
+system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl1.L1cache.num_data_array_reads 7 # number of data array reads +system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl1.L1cache.num_tag_array_reads 25 # number of tag array reads +system.tcp_cntrl1.L1cache.num_tag_array_writes 18 # number of tag array writes +system.tcp_cntrl1.L1cache.num_tag_array_stalls 2 # number of stalls caused by tag array +system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 3 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 2 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 4 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_st_misses 5 # stores that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.sqc_cntrl0.L1cache.demand_misses 0 # Number of 
cache demand misses +system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads +system.sqc_cntrl0.L1cache.num_data_array_writes 5 # number of data array writes +system.sqc_cntrl0.L1cache.num_tag_array_reads 86 # number of tag array reads +system.sqc_cntrl0.L1cache.num_tag_array_writes 5 # number of tag array writes +system.sqc_cntrl0.L1cache.num_data_array_stalls 44 # number of stalls caused by data array +system.sqc_cntrl0.sequencer.load_waiting_on_load 120 # Number of times a load aliased with a pending load +system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses +system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses +system.tccdir_cntrl0.directory.demand_hits 0 # Number of cache demand hits +system.tccdir_cntrl0.directory.demand_misses 0 # Number of cache demand misses +system.tccdir_cntrl0.directory.demand_accesses 0 # Number of cache demand accesses +system.tccdir_cntrl0.directory.num_tag_array_reads 1554 # number of tag array reads +system.tccdir_cntrl0.directory.num_tag_array_writes 27 # number of tag array writes +system.ruby.network.msg_count.Control 3116 +system.ruby.network.msg_count.Request_Control 3121 +system.ruby.network.msg_count.Response_Data 3159 +system.ruby.network.msg_count.Response_Control 3078 +system.ruby.network.msg_count.Unblock_Control 3121 +system.ruby.network.msg_byte.Control 24928 +system.ruby.network.msg_byte.Request_Control 24968 +system.ruby.network.msg_byte.Response_Data 227448 +system.ruby.network.msg_byte.Response_Control 24624 +system.ruby.network.msg_byte.Unblock_Control 24968 +system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses 
+system.sqc_coalescer.coalesced_accesses 63 # Number of coalesced TLB accesses +system.sqc_coalescer.queuing_cycles 100000 # Number of cycles spent in queue +system.sqc_coalescer.local_queuing_cycles 100000 # Number of cycles spent in queue for all incoming reqs +system.sqc_coalescer.local_latency 1162.790698 # Avg. latency over all incoming pkts +system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks +system.sqc_tlb.local_TLB_accesses 63 # Number of TLB accesses +system.sqc_tlb.local_TLB_hits 62 # Number of TLB hits +system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses +system.sqc_tlb.local_TLB_miss_rate 1.587302 # TLB miss rate +system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses +system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits +system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses +system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate +system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level +system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.sqc_tlb.unique_pages 1 # Number of unique pages touched +system.sqc_tlb.local_cycles 63001 # Number of cycles spent in queue for all incoming reqs +system.sqc_tlb.local_latency 1000.015873 # Avg. latency over incoming coalesced reqs +system.sqc_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.005592 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1551 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 12 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 1539 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1551 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12408 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 864 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 12312 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12408 +system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.016287 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 16 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 128 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520 +system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.001977 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1535 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 16 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 12280 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 1152 +system.ruby.network.ext_links1.int_node.throttle0.link_utilization 0.016287 +system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 16 +system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 128 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520 
+system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.003653 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 14 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1535 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 112 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12280 +system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.000084 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Control::1 8 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 7 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Control::1 64 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 504 +system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.000081 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Control::1 6 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 7 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Control::1 48 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 504 +system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0 +system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.002170 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Control::0 1535 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Request_Control::1 19 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::2 16 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 14 
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Unblock_Control::5 19 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Control::0 12280 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Request_Control::1 152 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::2 1152 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 1008 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Unblock_Control::5 152 +system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.000053 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Data::3 5 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Data::3 360 +system.ruby.network.ext_links2.int_node.throttle5.link_utilization 0.001939 +system.ruby.network.ext_links2.int_node.throttle5.msg_count.Request_Control::0 16 +system.ruby.network.ext_links2.int_node.throttle5.msg_count.Response_Data::2 10 +system.ruby.network.ext_links2.int_node.throttle5.msg_count.Response_Control::2 1525 +system.ruby.network.ext_links2.int_node.throttle5.msg_count.Unblock_Control::4 16 +system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Request_Control::0 128 +system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Response_Data::2 720 +system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Response_Control::2 12200 +system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Unblock_Control::4 128 +system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckS 1043 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00% 
+system.ruby.CorePair_Controller.NB_AckE 166 0.00% 0.00% +system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvData 9 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrData 7 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbInvData 8 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbShrData 5 0.00% 0.00% +system.ruby.CorePair_Controller.S.C0_Load_L1hit 635 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00% +system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1hit 2721 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00% +system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00% +system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.NB_AckS 9 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.NB_AckE 166 0.00% 0.00% 
+system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00% +system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkS 1039 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkM 335 0.00% 0.00% +system.ruby.Directory_Controller.RdBlk 177 0.00% 0.00% +system.ruby.Directory_Controller.CPUPrbResp 1551 0.00% 0.00% +system.ruby.Directory_Controller.ProbeAcksComplete 1551 0.00% 0.00% +system.ruby.Directory_Controller.MemData 1551 0.00% 0.00% +system.ruby.Directory_Controller.CoreUnblock 1551 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkS 1039 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkM 335 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlk 177 0.00% 0.00% +system.ruby.Directory_Controller.BS_M.MemData 29 0.00% 0.00% +system.ruby.Directory_Controller.BM_M.MemData 12 0.00% 0.00% +system.ruby.Directory_Controller.B_M.MemData 1 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.CPUPrbResp 29 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 29 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.MemData 1010 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.CPUPrbResp 12 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 12 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.MemData 323 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.CPUPrbResp 1 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.MemData 176 0.00% 0.00% +system.ruby.Directory_Controller.BS_Pm.CPUPrbResp 1010 0.00% 0.00% +system.ruby.Directory_Controller.BS_Pm.ProbeAcksComplete 1010 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 323 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 323 0.00% 0.00% 
+system.ruby.Directory_Controller.B_Pm.CPUPrbResp 176 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 176 0.00% 0.00% +system.ruby.Directory_Controller.B.CoreUnblock 1551 0.00% 0.00% +system.ruby.LD.latency_hist::bucket_size 32 +system.ruby.LD.latency_hist::max_bucket 319 +system.ruby.LD.latency_hist::samples 16335 +system.ruby.LD.latency_hist::mean 4.217447 +system.ruby.LD.latency_hist::gmean 2.103537 +system.ruby.LD.latency_hist::stdev 21.286370 +system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 166 1.02% 99.94% | 9 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.latency_hist::total 16335 +system.ruby.LD.hit_latency_hist::bucket_size 32 +system.ruby.LD.hit_latency_hist::max_bucket 319 +system.ruby.LD.hit_latency_hist::samples 175 +system.ruby.LD.hit_latency_hist::mean 208.468571 +system.ruby.LD.hit_latency_hist::gmean 208.231054 +system.ruby.LD.hit_latency_hist::stdev 10.632194 +system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 166 94.86% 94.86% | 9 5.14% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.hit_latency_hist::total 175 +system.ruby.LD.miss_latency_hist::bucket_size 4 +system.ruby.LD.miss_latency_hist::max_bucket 39 +system.ruby.LD.miss_latency_hist::samples 16160 +system.ruby.LD.miss_latency_hist::mean 2.005569 +system.ruby.LD.miss_latency_hist::gmean 2.001425 +system.ruby.LD.miss_latency_hist::stdev 0.316580 +system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.miss_latency_hist::total 16160 +system.ruby.ST.latency_hist::bucket_size 64 +system.ruby.ST.latency_hist::max_bucket 639 +system.ruby.ST.latency_hist::samples 10412 +system.ruby.ST.latency_hist::mean 
8.385709 +system.ruby.ST.latency_hist::gmean 2.308923 +system.ruby.ST.latency_hist::stdev 35.862445 +system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 0 0.00% 96.91% | 0 0.00% 96.91% | 316 3.03% 99.94% | 3 0.03% 99.97% | 3 0.03% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.latency_hist::total 10412 +system.ruby.ST.hit_latency_hist::bucket_size 64 +system.ruby.ST.hit_latency_hist::max_bucket 639 +system.ruby.ST.hit_latency_hist::samples 322 +system.ruby.ST.hit_latency_hist::mean 208.484472 +system.ruby.ST.hit_latency_hist::gmean 208.014366 +system.ruby.ST.hit_latency_hist::stdev 16.327683 +system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 316 98.14% 98.14% | 3 0.93% 99.07% | 3 0.93% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.hit_latency_hist::total 322 +system.ruby.ST.miss_latency_hist::bucket_size 1 +system.ruby.ST.miss_latency_hist::max_bucket 9 +system.ruby.ST.miss_latency_hist::samples 10090 +system.ruby.ST.miss_latency_hist::mean 2 +system.ruby.ST.miss_latency_hist::gmean 2.000000 +system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.miss_latency_hist::total 10090 +system.ruby.IFETCH.latency_hist::bucket_size 64 +system.ruby.IFETCH.latency_hist::max_bucket 639 +system.ruby.IFETCH.latency_hist::samples 87095 +system.ruby.IFETCH.latency_hist::mean 4.462093 +system.ruby.IFETCH.latency_hist::gmean 2.116390 +system.ruby.IFETCH.latency_hist::stdev 22.435279 +system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 0 0.00% 98.81% | 1011 1.16% 99.97% | 16 0.02% 99.99% | 7 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.latency_hist::total 87095 
+system.ruby.IFETCH.hit_latency_hist::bucket_size 64 +system.ruby.IFETCH.hit_latency_hist::max_bucket 639 +system.ruby.IFETCH.hit_latency_hist::samples 1034 +system.ruby.IFETCH.hit_latency_hist::mean 208.444874 +system.ruby.IFETCH.hit_latency_hist::gmean 207.968565 +system.ruby.IFETCH.hit_latency_hist::stdev 16.462617 +system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1011 97.78% 97.78% | 16 1.55% 99.32% | 7 0.68% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.hit_latency_hist::total 1034 +system.ruby.IFETCH.miss_latency_hist::bucket_size 4 +system.ruby.IFETCH.miss_latency_hist::max_bucket 39 +system.ruby.IFETCH.miss_latency_hist::samples 86061 +system.ruby.IFETCH.miss_latency_hist::mean 2.011294 +system.ruby.IFETCH.miss_latency_hist::gmean 2.002892 +system.ruby.IFETCH.miss_latency_hist::stdev 0.450747 +system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.miss_latency_hist::total 86061 +system.ruby.RMW_Read.latency_hist::bucket_size 32 +system.ruby.RMW_Read.latency_hist::max_bucket 319 +system.ruby.RMW_Read.latency_hist::samples 341 +system.ruby.RMW_Read.latency_hist::mean 4.392962 +system.ruby.RMW_Read.latency_hist::gmean 2.111743 +system.ruby.RMW_Read.latency_hist::stdev 21.996747 +system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 4 1.17% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.latency_hist::total 341 +system.ruby.RMW_Read.hit_latency_hist::bucket_size 32 +system.ruby.RMW_Read.hit_latency_hist::max_bucket 319 +system.ruby.RMW_Read.hit_latency_hist::samples 4 +system.ruby.RMW_Read.hit_latency_hist::mean 206 +system.ruby.RMW_Read.hit_latency_hist::gmean 206.000000 
+system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 4 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.hit_latency_hist::total 4 +system.ruby.RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.RMW_Read.miss_latency_hist::samples 337 +system.ruby.RMW_Read.miss_latency_hist::mean 2 +system.ruby.RMW_Read.miss_latency_hist::gmean 2.000000 +system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.miss_latency_hist::total 337 +system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.latency_hist::samples 10 +system.ruby.Locked_RMW_Read.latency_hist::mean 2 +system.ruby.Locked_RMW_Read.latency_hist::gmean 2 +system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.latency_hist::total 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::mean 2 +system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 2 +system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.miss_latency_hist::total 10 +system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1 
+system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.latency_hist::samples 10 +system.ruby.Locked_RMW_Write.latency_hist::mean 2 +system.ruby.Locked_RMW_Write.latency_hist::gmean 2 +system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.latency_hist::total 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::mean 2 +system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 2 +system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.miss_latency_hist::total 10 +system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1 +system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9 +system.ruby.L1Cache.miss_mach_latency_hist::samples 112609 +system.ruby.L1Cache.miss_mach_latency_hist::mean 2 +system.ruby.L1Cache.miss_mach_latency_hist::gmean 2.000000 +system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L1Cache.miss_mach_latency_hist::total 112609 +system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 4 +system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 39 +system.ruby.L2Cache.miss_mach_latency_hist::samples 59 +system.ruby.L2Cache.miss_mach_latency_hist::mean 20 +system.ruby.L2Cache.miss_mach_latency_hist::gmean 20.000000 +system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 
0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L2Cache.miss_mach_latency_hist::total 59 +system.ruby.Directory.hit_mach_latency_hist::bucket_size 64 +system.ruby.Directory.hit_mach_latency_hist::max_bucket 639 +system.ruby.Directory.hit_mach_latency_hist::samples 1535 +system.ruby.Directory.hit_mach_latency_hist::mean 208.449511 +system.ruby.Directory.hit_mach_latency_hist::gmean 208.002927 +system.ruby.Directory.hit_mach_latency_hist::stdev 15.847049 +system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1506 98.11% 98.11% | 19 1.24% 99.35% | 10 0.65% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Directory.hit_mach_latency_hist::total 1535 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 2.000000 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 4 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 39 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 20 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 20.000000 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 
0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5 +system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 32 +system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 319 +system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175 +system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 208.468571 +system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 208.231054 +system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 10.632194 +system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 166 94.86% 94.86% | 9 5.14% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 2.000000 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090 +system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322 +system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 208.484472 +system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 208.014366 +system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 16.327683 +system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 316 98.14% 98.14% | 3 0.93% 99.07% | 3 0.93% 100.00% | 0 
0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 2.000000 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 4 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 39 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 20 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 20.000000 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 208.444874 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 207.968565 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 16.462617 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1011 97.78% 97.78% | 
16 1.55% 99.32% | 7 0.68% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 2.000000 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 206 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 206.000000 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 4 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 2 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 
100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 2 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 2 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.SQC_Controller.Fetch 86 0.00% 0.00% +system.ruby.SQC_Controller.TCC_AckS 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00% +system.ruby.SQC_Controller.S.Fetch 81 0.00% 0.00% +system.ruby.SQC_Controller.I_S.TCC_AckS 5 0.00% 0.00% +system.ruby.TCCdir_Controller.RdBlk 53 0.00% 0.00% +system.ruby.TCCdir_Controller.RdBlkM 36 0.00% 0.00% +system.ruby.TCCdir_Controller.RdBlkS 5 0.00% 0.00% +system.ruby.TCCdir_Controller.CPUPrbResp 14 0.00% 0.00% +system.ruby.TCCdir_Controller.ProbeAcksComplete 13 0.00% 0.00% +system.ruby.TCCdir_Controller.CoreUnblock 17 0.00% 0.00% +system.ruby.TCCdir_Controller.LastCoreUnblock 2 0.00% 0.00% +system.ruby.TCCdir_Controller.NB_AckS 7 0.00% 0.00% +system.ruby.TCCdir_Controller.NB_AckM 9 0.00% 0.00% +system.ruby.TCCdir_Controller.PrbInvData 326 0.00% 0.00% +system.ruby.TCCdir_Controller.PrbShrData 1209 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlk 2 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlkM 9 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlkS 5 0.00% 0.00% +system.ruby.TCCdir_Controller.I.PrbInvData 
325 0.00% 0.00% +system.ruby.TCCdir_Controller.I.PrbShrData 1200 0.00% 0.00% +system.ruby.TCCdir_Controller.S.RdBlk 2 0.00% 0.00% +system.ruby.TCCdir_Controller.S.PrbInvData 1 0.00% 0.00% +system.ruby.TCCdir_Controller.M.RdBlkM 1 0.00% 0.00% +system.ruby.TCCdir_Controller.M.PrbShrData 9 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.CPUPrbResp 2 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_O.CPUPrbResp 9 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_O.ProbeAcksComplete 9 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.RdBlkM 22 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.NB_AckM 9 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.RdBlk 41 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.NB_AckS 2 0.00% 0.00% +system.ruby.TCCdir_Controller.I_S.NB_AckS 5 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_S.CPUPrbResp 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_S.ProbeAcksComplete 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.CPUPrbResp 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_M.CoreUnblock 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_S.LastCoreUnblock 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.RdBlk 8 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.CoreUnblock 7 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.RdBlkM 4 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.CoreUnblock 9 0.00% 0.00% +system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.Load::total 10 +system.ruby.TCP_Controller.Store | 9 50.00% 50.00% | 9 50.00% 100.00% +system.ruby.TCP_Controller.Store::total 18 +system.ruby.TCP_Controller.TCC_AckS | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckS::total 4 +system.ruby.TCP_Controller.TCC_AckM | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckM::total 10 +system.ruby.TCP_Controller.PrbInvData | 1 33.33% 
33.33% | 2 66.67% 100.00% +system.ruby.TCP_Controller.PrbInvData::total 3 +system.ruby.TCP_Controller.PrbShrData | 7 63.64% 63.64% | 4 36.36% 100.00% +system.ruby.TCP_Controller.PrbShrData::total 11 +system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.Load::total 4 +system.ruby.TCP_Controller.I.Store | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.I.Store::total 10 +system.ruby.TCP_Controller.S.Load | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.S.Load::total 6 +system.ruby.TCP_Controller.S.PrbInvData | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.S.PrbInvData::total 2 +system.ruby.TCP_Controller.S.PrbShrData | 2 100.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.S.PrbShrData::total 2 +system.ruby.TCP_Controller.M.Store | 4 50.00% 50.00% | 4 50.00% 100.00% +system.ruby.TCP_Controller.M.Store::total 8 +system.ruby.TCP_Controller.M.PrbInvData | 0 0.00% 0.00% | 1 100.00% 100.00% +system.ruby.TCP_Controller.M.PrbInvData::total 1 +system.ruby.TCP_Controller.M.PrbShrData | 5 55.56% 55.56% | 4 44.44% 100.00% +system.ruby.TCP_Controller.M.PrbShrData::total 9 +system.ruby.TCP_Controller.I_M.TCC_AckM | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.I_M.TCC_AckM::total 10 +system.ruby.TCP_Controller.I_ES.TCC_AckS | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I_ES.TCC_AckS::total 4 + +---------- End Simulation Statistics ---------- diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini new file mode 100644 index 000000000..33ae7164f --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini @@ -0,0 +1,4063 @@ +[root] +type=Root +children=system +eventq_index=0 +full_system=false +sim_quantum=0 +time_sync_enable=false +time_sync_period=100000000000 +time_sync_spin_threshold=100000000 + +[system] +type=System 
+children=clk_domain cpu0 cpu1 cpu2 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_coalescer sqc_tlb sys_port_proxy voltage_domain +boot_osflags=a +cache_line_size=64 +clk_domain=system.clk_domain +eventq_index=0 +exit_on_work_items=false +init_param=0 +kernel= +kernel_addr_check=true +load_addr_mask=1099511627775 +load_offset=0 +mem_mode=timing +mem_ranges=0:536870911 +memories=system.mem_ctrls system.ruby.phys_mem +mmap_using_noreserve=false +multi_thread=false +num_work_ids=16 +readfile= +symbolfile= +work_begin_ckpt_count=0 +work_begin_cpu_id_exit=-1 +work_begin_exit_count=0 +work_cpus_ckpt_count=0 +work_end_ckpt_count=0 +work_end_exit_count=0 +work_item_id=-1 +system_port=system.sys_port_proxy.slave[0] + +[system.clk_domain] +type=SrcClockDomain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cpu0] +type=TimingSimpleCPU +children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload +branchPred=Null +checker=Null +clk_domain=system.cpu0.clk_domain +cpu_id=0 +do_checkpoint_insts=true +do_quiesce=true +do_statistics_insts=true +dtb=system.cpu0.dtb +eventq_index=0 +function_trace=false +function_trace_start=0 +interrupts=system.cpu0.interrupts +isa=system.cpu0.isa +itb=system.cpu0.itb +max_insts_all_threads=0 +max_insts_any_thread=0 +max_loads_all_threads=0 +max_loads_any_thread=0 +numThreads=1 +profile=0 +progress_interval=0 +simpoint_start_insts= +socket_id=0 +switched_out=false +system=system +tracer=system.cpu0.tracer +workload=system.cpu0.workload +dcache_port=system.ruby.cp_cntrl0.sequencer.slave[1] +icache_port=system.ruby.cp_cntrl0.sequencer.slave[0] + +[system.cpu0.apic_clk_domain] +type=DerivedClockDomain +clk_divider=16 +clk_domain=system.cpu0.clk_domain +eventq_index=0 + +[system.cpu0.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 
+init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cpu0.dtb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.dtb.walker + +[system.cpu0.dtb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.ruby.cp_cntrl0.sequencer.slave[3] + +[system.cpu0.interrupts] +type=X86LocalApic +clk_domain=system.cpu0.apic_clk_domain +eventq_index=0 +int_latency=1000 +pio_addr=2305843009213693952 +pio_latency=100000 +system=system +int_master=system.ruby.cp_cntrl0.sequencer.slave[4] +int_slave=system.ruby.cp_cntrl0.sequencer.master[1] +pio=system.ruby.cp_cntrl0.sequencer.master[0] + +[system.cpu0.isa] +type=X86ISA +eventq_index=0 + +[system.cpu0.itb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.itb.walker + +[system.cpu0.itb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.ruby.cp_cntrl0.sequencer.slave[2] + +[system.cpu0.tracer] +type=ExeTracer +eventq_index=0 + +[system.cpu0.workload] +type=LiveProcess +cmd=gpu-hello +cwd= +drivers=system.cpu2.cl_driver +egid=100 +env= +errout=cerr +euid=100 +eventq_index=0 +executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello +gid=100 +input=cin +kvmInSE=false +max_stack_size=67108864 +output=cout +pid=100 +ppid=99 +simpoint=0 +system=system +uid=100 +useArchPT=false + +[system.cpu1] +type=Shader +children=CUs0 CUs1 clk_domain +CUs=system.cpu1.CUs0 system.cpu1.CUs1 +clk_domain=system.cpu1.clk_domain +cpu_pointer=system.cpu0 +eventq_index=0 +globalmem=65536 +impl_kern_boundary_sync=true +n_wf=8 +separate_acquire_release=false +timing=true +translation=false + +[system.cpu1.CUs0] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 
wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=0 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs0.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 
system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs0.ldsBus.slave +memory_port=system.ruby.tcp_cntrl0.coalescer.slave[0] system.ruby.tcp_cntrl0.coalescer.slave[1] system.ruby.tcp_cntrl0.coalescer.slave[2] system.ruby.tcp_cntrl0.coalescer.slave[3] system.ruby.tcp_cntrl0.coalescer.slave[4] system.ruby.tcp_cntrl0.coalescer.slave[5] system.ruby.tcp_cntrl0.coalescer.slave[6] system.ruby.tcp_cntrl0.coalescer.slave[7] system.ruby.tcp_cntrl0.coalescer.slave[8] system.ruby.tcp_cntrl0.coalescer.slave[9] system.ruby.tcp_cntrl0.coalescer.slave[10] system.ruby.tcp_cntrl0.coalescer.slave[11] system.ruby.tcp_cntrl0.coalescer.slave[12] system.ruby.tcp_cntrl0.coalescer.slave[13] system.ruby.tcp_cntrl0.coalescer.slave[14] system.ruby.tcp_cntrl0.coalescer.slave[15] system.ruby.tcp_cntrl0.coalescer.slave[16] system.ruby.tcp_cntrl0.coalescer.slave[17] system.ruby.tcp_cntrl0.coalescer.slave[18] system.ruby.tcp_cntrl0.coalescer.slave[19] system.ruby.tcp_cntrl0.coalescer.slave[20] system.ruby.tcp_cntrl0.coalescer.slave[21] system.ruby.tcp_cntrl0.coalescer.slave[22] system.ruby.tcp_cntrl0.coalescer.slave[23] system.ruby.tcp_cntrl0.coalescer.slave[24] system.ruby.tcp_cntrl0.coalescer.slave[25] system.ruby.tcp_cntrl0.coalescer.slave[26] system.ruby.tcp_cntrl0.coalescer.slave[27] system.ruby.tcp_cntrl0.coalescer.slave[28] system.ruby.tcp_cntrl0.coalescer.slave[29] system.ruby.tcp_cntrl0.coalescer.slave[30] system.ruby.tcp_cntrl0.coalescer.slave[31] system.ruby.tcp_cntrl0.coalescer.slave[32] system.ruby.tcp_cntrl0.coalescer.slave[33] system.ruby.tcp_cntrl0.coalescer.slave[34] system.ruby.tcp_cntrl0.coalescer.slave[35] system.ruby.tcp_cntrl0.coalescer.slave[36] system.ruby.tcp_cntrl0.coalescer.slave[37] system.ruby.tcp_cntrl0.coalescer.slave[38] system.ruby.tcp_cntrl0.coalescer.slave[39] system.ruby.tcp_cntrl0.coalescer.slave[40] system.ruby.tcp_cntrl0.coalescer.slave[41] 
system.ruby.tcp_cntrl0.coalescer.slave[42] system.ruby.tcp_cntrl0.coalescer.slave[43] system.ruby.tcp_cntrl0.coalescer.slave[44] system.ruby.tcp_cntrl0.coalescer.slave[45] system.ruby.tcp_cntrl0.coalescer.slave[46] system.ruby.tcp_cntrl0.coalescer.slave[47] system.ruby.tcp_cntrl0.coalescer.slave[48] system.ruby.tcp_cntrl0.coalescer.slave[49] system.ruby.tcp_cntrl0.coalescer.slave[50] system.ruby.tcp_cntrl0.coalescer.slave[51] system.ruby.tcp_cntrl0.coalescer.slave[52] system.ruby.tcp_cntrl0.coalescer.slave[53] system.ruby.tcp_cntrl0.coalescer.slave[54] system.ruby.tcp_cntrl0.coalescer.slave[55] system.ruby.tcp_cntrl0.coalescer.slave[56] system.ruby.tcp_cntrl0.coalescer.slave[57] system.ruby.tcp_cntrl0.coalescer.slave[58] system.ruby.tcp_cntrl0.coalescer.slave[59] system.ruby.tcp_cntrl0.coalescer.slave[60] system.ruby.tcp_cntrl0.coalescer.slave[61] system.ruby.tcp_cntrl0.coalescer.slave[62] system.ruby.tcp_cntrl0.coalescer.slave[63] +sqc_port=system.ruby.sqc_cntrl0.sequencer.slave[0] +sqc_tlb_port=system.sqc_coalescer.slave[0] +translation_port=system.l1_coalescer0.slave[0] + +[system.cpu1.CUs0.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs0.localDataStore.cuPort +slave=system.cpu1.CUs0.ldsPort + +[system.cpu1.CUs0.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs0.ldsBus.master + +[system.cpu1.CUs0.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs0.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs0.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs0.vector_register_file3] +type=VectorRegisterFile 
+eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs0.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + 
+[system.cpu1.CUs0.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.CUs1] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=1 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs1.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 
+num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs1.ldsBus.slave +memory_port=system.ruby.tcp_cntrl1.coalescer.slave[0] system.ruby.tcp_cntrl1.coalescer.slave[1] system.ruby.tcp_cntrl1.coalescer.slave[2] system.ruby.tcp_cntrl1.coalescer.slave[3] system.ruby.tcp_cntrl1.coalescer.slave[4] system.ruby.tcp_cntrl1.coalescer.slave[5] system.ruby.tcp_cntrl1.coalescer.slave[6] system.ruby.tcp_cntrl1.coalescer.slave[7] system.ruby.tcp_cntrl1.coalescer.slave[8] system.ruby.tcp_cntrl1.coalescer.slave[9] system.ruby.tcp_cntrl1.coalescer.slave[10] system.ruby.tcp_cntrl1.coalescer.slave[11] system.ruby.tcp_cntrl1.coalescer.slave[12] 
system.ruby.tcp_cntrl1.coalescer.slave[13] system.ruby.tcp_cntrl1.coalescer.slave[14] system.ruby.tcp_cntrl1.coalescer.slave[15] system.ruby.tcp_cntrl1.coalescer.slave[16] system.ruby.tcp_cntrl1.coalescer.slave[17] system.ruby.tcp_cntrl1.coalescer.slave[18] system.ruby.tcp_cntrl1.coalescer.slave[19] system.ruby.tcp_cntrl1.coalescer.slave[20] system.ruby.tcp_cntrl1.coalescer.slave[21] system.ruby.tcp_cntrl1.coalescer.slave[22] system.ruby.tcp_cntrl1.coalescer.slave[23] system.ruby.tcp_cntrl1.coalescer.slave[24] system.ruby.tcp_cntrl1.coalescer.slave[25] system.ruby.tcp_cntrl1.coalescer.slave[26] system.ruby.tcp_cntrl1.coalescer.slave[27] system.ruby.tcp_cntrl1.coalescer.slave[28] system.ruby.tcp_cntrl1.coalescer.slave[29] system.ruby.tcp_cntrl1.coalescer.slave[30] system.ruby.tcp_cntrl1.coalescer.slave[31] system.ruby.tcp_cntrl1.coalescer.slave[32] system.ruby.tcp_cntrl1.coalescer.slave[33] system.ruby.tcp_cntrl1.coalescer.slave[34] system.ruby.tcp_cntrl1.coalescer.slave[35] system.ruby.tcp_cntrl1.coalescer.slave[36] system.ruby.tcp_cntrl1.coalescer.slave[37] system.ruby.tcp_cntrl1.coalescer.slave[38] system.ruby.tcp_cntrl1.coalescer.slave[39] system.ruby.tcp_cntrl1.coalescer.slave[40] system.ruby.tcp_cntrl1.coalescer.slave[41] system.ruby.tcp_cntrl1.coalescer.slave[42] system.ruby.tcp_cntrl1.coalescer.slave[43] system.ruby.tcp_cntrl1.coalescer.slave[44] system.ruby.tcp_cntrl1.coalescer.slave[45] system.ruby.tcp_cntrl1.coalescer.slave[46] system.ruby.tcp_cntrl1.coalescer.slave[47] system.ruby.tcp_cntrl1.coalescer.slave[48] system.ruby.tcp_cntrl1.coalescer.slave[49] system.ruby.tcp_cntrl1.coalescer.slave[50] system.ruby.tcp_cntrl1.coalescer.slave[51] system.ruby.tcp_cntrl1.coalescer.slave[52] system.ruby.tcp_cntrl1.coalescer.slave[53] system.ruby.tcp_cntrl1.coalescer.slave[54] system.ruby.tcp_cntrl1.coalescer.slave[55] system.ruby.tcp_cntrl1.coalescer.slave[56] system.ruby.tcp_cntrl1.coalescer.slave[57] system.ruby.tcp_cntrl1.coalescer.slave[58] 
system.ruby.tcp_cntrl1.coalescer.slave[59] system.ruby.tcp_cntrl1.coalescer.slave[60] system.ruby.tcp_cntrl1.coalescer.slave[61] system.ruby.tcp_cntrl1.coalescer.slave[62] system.ruby.tcp_cntrl1.coalescer.slave[63] +sqc_port=system.ruby.sqc_cntrl0.sequencer.slave[1] +sqc_tlb_port=system.sqc_coalescer.slave[1] +translation_port=system.l1_coalescer1.slave[0] + +[system.cpu1.CUs1.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs1.localDataStore.cuPort +slave=system.cpu1.CUs1.ldsPort + +[system.cpu1.CUs1.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs1.ldsBus.master + +[system.cpu1.CUs1.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs1.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs1.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs1.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs1.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + 
+[system.cpu1.CUs1.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 
+ +[system.cpu1.CUs1.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.cpu1.clk_domain.voltage_domain + +[system.cpu1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.cpu2] +type=GpuDispatcher +children=cl_driver +cl_driver=system.cpu2.cl_driver +clk_domain=system.clk_domain +cpu=system.cpu0 +eventq_index=0 +pio_addr=8589934592 +pio_latency=1000 +shader_pointer=system.cpu1 +system=system +dma=system.piobus.slave[1] +pio=system.piobus.master[0] +translation_port=system.dispatcher_coalescer.slave[0] + +[system.cpu2.cl_driver] +type=ClDriver +codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +eventq_index=0 +filename=hsa + +[system.dispatcher_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.dispatcher_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.dispatcher_tlb.slave[0] +slave=system.cpu2.translation_port + +[system.dispatcher_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain + +[system.dispatcher_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dispatcher_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.dispatcher_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[1] 
+slave=system.dispatcher_coalescer.master[0] + +[system.dispatcher_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain + +[system.dispatcher_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dvfs_handler] +type=DVFSHandler +domains= +enable=false +eventq_index=0 +sys_clk_domain=system.clk_domain +transition_latency=100000000 + +[system.l1_coalescer0] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer0.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb0.slave[0] +slave=system.cpu1.CUs0.translation_port[0] + +[system.l1_coalescer0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain + +[system.l1_coalescer0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_coalescer1] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer1.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb1.slave[0] +slave=system.cpu1.CUs1.translation_port[0] + +[system.l1_coalescer1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain + +[system.l1_coalescer1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb0] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb0.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[2] +slave=system.l1_coalescer0.master[0] + 
+[system.l1_tlb0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb0.clk_domain.voltage_domain + +[system.l1_tlb0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb1] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb1.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[3] +slave=system.l1_coalescer1.master[0] + +[system.l1_tlb1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb1.clk_domain.voltage_domain + +[system.l1_tlb1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l2_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l2_tlb.slave[0] +slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0] + +[system.l2_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_coalescer.clk_domain.voltage_domain + +[system.l2_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l2_tlb.clk_domain +eventq_index=0 +hitLatency=69 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=4096 +master=system.l3_coalescer.slave[0] +slave=system.l2_coalescer.master[0] + +[system.l2_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 
+init_perf_level=0 +voltage_domain=system.l2_tlb.clk_domain.voltage_domain + +[system.l2_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l3_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l3_tlb.slave[0] +slave=system.l2_tlb.master[0] + +[system.l3_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_coalescer.clk_domain.voltage_domain + +[system.l3_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l3_tlb.clk_domain +eventq_index=0 +hitLatency=150 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=8192 +slave=system.l3_coalescer.master[0] + +[system.l3_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_tlb.clk_domain.voltage_domain + +[system.l3_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.mem_ctrls] +type=DRAMCtrl +IDD0=0.075000 +IDD02=0.000000 +IDD2N=0.050000 +IDD2N2=0.000000 +IDD2P0=0.000000 +IDD2P02=0.000000 +IDD2P1=0.000000 +IDD2P12=0.000000 +IDD3N=0.057000 +IDD3N2=0.000000 +IDD3P0=0.000000 +IDD3P02=0.000000 +IDD3P1=0.000000 +IDD3P12=0.000000 +IDD4R=0.187000 +IDD4R2=0.000000 +IDD4W=0.165000 +IDD4W2=0.000000 +IDD5=0.220000 +IDD52=0.000000 +IDD6=0.000000 +IDD62=0.000000 +VDD=1.500000 +VDD2=0.000000 +activation_limit=4 +addr_mapping=RoRaBaCoCh +bank_groups_per_rank=0 +banks_per_rank=8 +burst_length=8 +channels=1 +clk_domain=system.clk_domain +conf_table_reported=true +device_bus_width=8 +device_rowbuffer_size=1024 +device_size=536870912 
+devices_per_rank=8 +dll=true +eventq_index=0 +in_addr_map=true +max_accesses_per_row=16 +mem_sched_policy=frfcfs +min_writes_per_switch=16 +null=false +page_policy=open_adaptive +range=0:536870911 +ranks_per_channel=2 +read_buffer_size=32 +static_backend_latency=10000 +static_frontend_latency=10000 +tBURST=5000 +tCCD_L=0 +tCK=1250 +tCL=13750 +tCS=2500 +tRAS=35000 +tRCD=13750 +tREFI=7800000 +tRFC=260000 +tRP=13750 +tRRD=6000 +tRRD_L=0 +tRTP=7500 +tRTW=2500 +tWR=15000 +tWTR=7500 +tXAW=30000 +tXP=0 +tXPDLL=0 +tXS=0 +tXSDLL=0 +write_buffer_size=64 +write_high_thresh_perc=85 +write_low_thresh_perc=50 +port=system.ruby.dir_cntrl0.memory + +[system.piobus] +type=NoncoherentXBar +clk_domain=system.clk_domain +eventq_index=0 +forward_latency=0 +frontend_latency=0 +response_latency=0 +use_default_range=false +width=32 +master=system.cpu2.pio +slave=system.ruby.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma + +[system.ruby] +type=RubySystem +children=clk_domain cp_cntrl0 dir_cntrl0 network phys_mem sqc_cntrl0 tcc_cntrl0 tcp_cntrl0 tcp_cntrl1 +access_backing_store=true +all_instructions=false +block_size_bytes=64 +clk_domain=system.ruby.clk_domain +eventq_index=0 +hot_lines=false +memory_size_bits=48 +num_of_sequencers=5 +number_of_virtual_networks=10 +phys_mem=system.ruby.phys_mem +randomization=false + +[system.ruby.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.ruby.cp_cntrl0] +type=CorePair_Controller +children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore +L1D0cache=system.ruby.cp_cntrl0.L1D0cache +L1D1cache=system.ruby.cp_cntrl0.L1D1cache +L1Icache=system.ruby.cp_cntrl0.L1Icache +L2cache=system.ruby.cp_cntrl0.L2cache +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=120 +l2_hit_latency=18 
+mandatoryQueue=system.ruby.cp_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToCore=system.ruby.cp_cntrl0.probeToCore +recycle_latency=10 +requestFromCore=system.ruby.cp_cntrl0.requestFromCore +responseFromCore=system.ruby.cp_cntrl0.responseFromCore +responseToCore=system.ruby.cp_cntrl0.responseToCore +ruby_system=system.ruby +send_evictions=true +sequencer=system.ruby.cp_cntrl0.sequencer +sequencer1=system.ruby.cp_cntrl0.sequencer1 +system=system +transitions_per_cycle=32 +triggerQueue=system.ruby.cp_cntrl0.triggerQueue +unblockFromCore=system.ruby.cp_cntrl0.unblockFromCore +version=0 + +[system.ruby.cp_cntrl0.L1D0cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.cp_cntrl0.L1D0cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.ruby.cp_cntrl0.L1D0cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.ruby.cp_cntrl0.L1D1cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.cp_cntrl0.L1D1cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.ruby.cp_cntrl0.L1D1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.ruby.cp_cntrl0.L1Icache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.cp_cntrl0.L1Icache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + 
+[system.ruby.cp_cntrl0.L1Icache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=32768 + +[system.ruby.cp_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.cp_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.ruby.cp_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=2097152 + +[system.ruby.cp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.ruby.cp_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[3] + +[system.ruby.cp_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[2] + +[system.ruby.cp_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[3] + +[system.ruby.cp_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[4] + +[system.ruby.cp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.ruby.clk_domain +coreid=0 +dcache=system.ruby.cp_cntrl0.L1D0cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=0 +master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave 
+mem_master_port=system.piobus.slave[0] +slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master + +[system.ruby.cp_cntrl0.sequencer1] +type=RubySequencer +clk_domain=system.ruby.clk_domain +coreid=1 +dcache=system.ruby.cp_cntrl0.L1D1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=1 + +[system.ruby.cp_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.cp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[4] + +[system.ruby.dir_cntrl0] +type=Directory_Controller +children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores +CPUonly=false +L3CacheMemory=system.ruby.dir_cntrl0.L3CacheMemory +L3triggerQueue=system.ruby.dir_cntrl0.L3triggerQueue +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +directory=system.ruby.dir_cntrl0.directory +eventq_index=0 +l3_hit_latency=15 +noTCCdir=true +number_of_TBEs=256 +probeToCore=system.ruby.dir_cntrl0.probeToCore +recycle_latency=10 +requestFromCores=system.ruby.dir_cntrl0.requestFromCores +responseFromCores=system.ruby.dir_cntrl0.responseFromCores +responseFromMemory=system.ruby.dir_cntrl0.responseFromMemory +responseToCore=system.ruby.dir_cntrl0.responseToCore +response_latency=30 +ruby_system=system.ruby +system=system +to_memory_controller_latency=1 +transitions_per_cycle=32 +triggerQueue=system.ruby.dir_cntrl0.triggerQueue 
+unblockFromCores=system.ruby.dir_cntrl0.unblockFromCores +useL3OnWT=false +version=0 +memory=system.mem_ctrls.port + +[system.ruby.dir_cntrl0.L3CacheMemory] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=20 +dataArrayBanks=16.0 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.dir_cntrl0.L3CacheMemory.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=16777216 +start_index_bit=6 +tagAccessLatency=15 +tagArrayBanks=16.0 + +[system.ruby.dir_cntrl0.L3CacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16777216 + +[system.ruby.dir_cntrl0.L3triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.dir_cntrl0.directory] +type=RubyDirectoryMemory +eventq_index=0 +numa_high_bit=5 +size=536870912 +version=0 + +[system.ruby.dir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[0] + +[system.ruby.dir_cntrl0.requestFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[0] + +[system.ruby.dir_cntrl0.responseFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[1] + +[system.ruby.dir_cntrl0.responseFromMemory] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.ruby.dir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[1] + +[system.ruby.dir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.dir_cntrl0.unblockFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false 
+slave=system.ruby.network.master[2] + +[system.ruby.network] +type=SimpleNetwork +children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1 +adaptive_routing=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +control_msg_size=8 +endpoint_bandwidth=1000 +eventq_index=0 +ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 +int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 
system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39 +int_links=system.ruby.network.int_links0 system.ruby.network.int_links1 +netifs= +number_of_virtual_networks=10 +routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node +ruby_system=system.ruby +topology=Crossbar +master=system.ruby.dir_cntrl0.requestFromCores.slave system.ruby.dir_cntrl0.responseFromCores.slave system.ruby.dir_cntrl0.unblockFromCores.slave system.ruby.cp_cntrl0.probeToCore.slave system.ruby.cp_cntrl0.responseToCore.slave system.ruby.tcp_cntrl0.probeToTCP.slave system.ruby.tcp_cntrl0.responseToTCP.slave system.ruby.tcp_cntrl1.probeToTCP.slave system.ruby.tcp_cntrl1.responseToTCP.slave system.ruby.sqc_cntrl0.probeToSQC.slave system.ruby.sqc_cntrl0.responseToSQC.slave system.ruby.tcc_cntrl0.requestFromTCP.slave system.ruby.tcc_cntrl0.probeFromNB.slave system.ruby.tcc_cntrl0.responseFromNB.slave +slave=system.ruby.dir_cntrl0.probeToCore.master system.ruby.dir_cntrl0.responseToCore.master system.ruby.cp_cntrl0.requestFromCore.master system.ruby.cp_cntrl0.responseFromCore.master system.ruby.cp_cntrl0.unblockFromCore.master 
system.ruby.tcp_cntrl0.requestFromTCP.master system.ruby.tcp_cntrl0.responseFromTCP.master system.ruby.tcp_cntrl0.unblockFromCore.master system.ruby.tcp_cntrl1.requestFromTCP.master system.ruby.tcp_cntrl1.responseFromTCP.master system.ruby.tcp_cntrl1.unblockFromCore.master system.ruby.sqc_cntrl0.requestFromSQC.master system.ruby.tcc_cntrl0.responseToCore.master system.ruby.tcc_cntrl0.requestToNB.master system.ruby.tcc_cntrl0.responseToNB.master system.ruby.tcc_cntrl0.unblockToNB.master + +[system.ruby.network.ext_links0] +type=SimpleExtLink +children=int_node +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.dir_cntrl0 +int_node=system.ruby.network.ext_links0.int_node +latency=1 +link_id=0 +weight=1 + +[system.ruby.network.ext_links0.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 
+clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 
system.ruby.network.ext_links0.int_node.port_buffers35 system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 
system.ruby.network.ext_links0.int_node.port_buffers71 system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79 +router_id=0 +virt_nets=10 + +[system.ruby.network.ext_links0.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1] +type=SimpleExtLink +children=int_node +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.cp_cntrl0 +int_node=system.ruby.network.ext_links1.int_node +latency=1 +link_id=1 +weight=1 + +[system.ruby.network.ext_links1.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 
system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 
system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69 +router_id=1 +virt_nets=10 + +[system.ruby.network.ext_links1.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2] +type=SimpleExtLink +children=int_node +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.tcp_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=2 +weight=1 + +[system.ruby.network.ext_links2.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 
system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 
system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 +router_id=2 +virt_nets=10 + +[system.ruby.network.ext_links2.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links3] +type=SimpleExtLink +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.tcp_cntrl1 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=3 +weight=1 + +[system.ruby.network.ext_links4] 
+type=SimpleExtLink +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.sqc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=4 +weight=1 + +[system.ruby.network.ext_links5] +type=SimpleExtLink +bandwidth_factor=8 +eventq_index=0 +ext_node=system.ruby.tcc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=5 +weight=1 + +[system.ruby.network.int_link_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers12] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers27] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_links0] +type=SimpleIntLink +bandwidth_factor=8 +eventq_index=0 +latency=1 +link_id=0 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links1.int_node +weight=1 + +[system.ruby.network.int_links1] +type=SimpleIntLink +bandwidth_factor=8 +eventq_index=0 +latency=1 +link_id=1 +node_a=system.ruby.network.ext_links0.int_node 
+node_b=system.ruby.network.ext_links2.int_node +weight=1 + +[system.ruby.phys_mem] +type=SimpleMemory +bandwidth=73.000000 +clk_domain=system.ruby.clk_domain +conf_table_reported=true +eventq_index=0 +in_addr_map=false +latency=30000 +latency_var=0 +null=false +range=0:536870911 + +[system.ruby.sqc_cntrl0] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer +L1cache=system.ruby.sqc_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 +mandatoryQueue=system.ruby.sqc_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.ruby.sqc_cntrl0.probeToSQC +recycle_latency=10 +requestFromSQC=system.ruby.sqc_cntrl0.requestFromSQC +responseToSQC=system.ruby.sqc_cntrl0.responseToSQC +ruby_system=system.ruby +sequencer=system.ruby.sqc_cntrl0.sequencer +system=system +transitions_per_cycle=32 +version=0 + +[system.ruby.sqc_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=8 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.sqc_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=8 + +[system.ruby.sqc_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.ruby.sqc_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.ruby.sqc_cntrl0.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[9] + +[system.ruby.sqc_cntrl0.requestFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[11] + +[system.ruby.sqc_cntrl0.responseToSQC] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[10] + +[system.ruby.sqc_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.ruby.clk_domain +coreid=99 +dcache=system.ruby.sqc_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.sqc_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=6 +slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port + +[system.ruby.tcc_cntrl0] +type=TCC_Controller +children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB +L2cache=system.ruby.tcc_cntrl0.L2cache +WB=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +eventq_index=0 +l2_request_latency=120 +l2_response_latency=16 +number_of_TBEs=5120 +probeFromNB=system.ruby.tcc_cntrl0.probeFromNB +recycle_latency=10 +requestFromTCP=system.ruby.tcc_cntrl0.requestFromTCP +requestToNB=system.ruby.tcc_cntrl0.requestToNB +responseFromNB=system.ruby.tcc_cntrl0.responseFromNB +responseToCore=system.ruby.tcc_cntrl0.responseToCore +responseToNB=system.ruby.tcc_cntrl0.responseToNB +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +triggerQueue=system.ruby.tcc_cntrl0.triggerQueue +unblockToNB=system.ruby.tcc_cntrl0.unblockToNB +version=0 + +[system.ruby.tcc_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=8 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.tcc_cntrl0.L2cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=262144 +start_index_bit=6 +tagAccessLatency=2 +tagArrayBanks=256 + +[system.ruby.tcc_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 
+block_size=64 +eventq_index=0 +size=262144 + +[system.ruby.tcc_cntrl0.probeFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[12] + +[system.ruby.tcc_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[11] + +[system.ruby.tcc_cntrl0.requestToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[13] + +[system.ruby.tcc_cntrl0.responseFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[13] + +[system.ruby.tcc_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[12] + +[system.ruby.tcc_cntrl0.responseToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[14] + +[system.ruby.tcc_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.tcc_cntrl0.unblockToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[15] + +[system.ruby.tcp_cntrl0] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.ruby.tcp_cntrl0.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +coalescer=system.ruby.tcp_cntrl0.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.ruby.tcp_cntrl0.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.ruby.tcp_cntrl0.probeToTCP +recycle_latency=10 +requestFromTCP=system.ruby.tcp_cntrl0.requestFromTCP 
+responseFromTCP=system.ruby.tcp_cntrl0.responseFromTCP +responseToTCP=system.ruby.tcp_cntrl0.responseToTCP +ruby_system=system.ruby +sequencer=system.ruby.tcp_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.ruby.tcp_cntrl0.unblockFromCore +use_seq_not_coal=false +version=0 + +[system.ruby.tcp_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.tcp_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=16 + +[system.ruby.tcp_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.ruby.tcp_cntrl0.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.ruby.clk_domain +coreid=99 +dcache=system.ruby.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=2 +slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] 
system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63] + +[system.ruby.tcp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.ruby.tcp_cntrl0.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[5] + +[system.ruby.tcp_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[5] + 
+[system.ruby.tcp_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[6] + +[system.ruby.tcp_cntrl0.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[6] + +[system.ruby.tcp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.ruby.clk_domain +coreid=99 +dcache=system.ruby.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=3 + +[system.ruby.tcp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[7] + +[system.ruby.tcp_cntrl1] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.ruby.tcp_cntrl1.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +cluster_id=0 +coalescer=system.ruby.tcp_cntrl1.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.ruby.tcp_cntrl1.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.ruby.tcp_cntrl1.probeToTCP +recycle_latency=10 +requestFromTCP=system.ruby.tcp_cntrl1.requestFromTCP +responseFromTCP=system.ruby.tcp_cntrl1.responseFromTCP +responseToTCP=system.ruby.tcp_cntrl1.responseToTCP +ruby_system=system.ruby +sequencer=system.ruby.tcp_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.ruby.tcp_cntrl1.unblockFromCore +use_seq_not_coal=false +version=1 + +[system.ruby.tcp_cntrl1.L1cache] +type=RubyCache 
+children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.ruby.tcp_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=16 + +[system.ruby.tcp_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.ruby.tcp_cntrl1.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.ruby.clk_domain +coreid=99 +dcache=system.ruby.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=4 +slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] 
system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63] + +[system.ruby.tcp_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.ruby.tcp_cntrl1.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[7] + +[system.ruby.tcp_cntrl1.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[8] + +[system.ruby.tcp_cntrl1.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[9] + +[system.ruby.tcp_cntrl1.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[8] + +[system.ruby.tcp_cntrl1.sequencer] 
+type=RubySequencer +clk_domain=system.ruby.clk_domain +coreid=99 +dcache=system.ruby.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.ruby.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=5 + +[system.ruby.tcp_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[10] + +[system.sqc_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.sqc_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.sqc_tlb.slave[0] +slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port + +[system.sqc_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain + +[system.sqc_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sqc_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.sqc_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[0] +slave=system.sqc_coalescer.master[0] + +[system.sqc_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_tlb.clk_domain.voltage_domain + +[system.sqc_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sys_port_proxy] +type=RubyPortProxy +clk_domain=system.clk_domain +eventq_index=0 +is_cpu_sequencer=true +no_retry_on_stall=false 
+ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_ruby_tester=false +version=0 +slave=system.system_port + +[system.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr new file mode 100755 index 000000000..1e2b8911e --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr @@ -0,0 +1,5 @@ +warn: system.ruby.network adopting orphan SimObject param 'int_links' +warn: system.ruby.network adopting orphan SimObject param 'ext_links' +warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes) +warn: Sockets disabled, not accepting gdb connections +warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files! diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout new file mode 100755 index 000000000..3b7ae46db --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout @@ -0,0 +1,21 @@ +gem5 Simulator System. http://gem5.org +gem5 is copyrighted software; use the --copyright option for details. + +gem5 compiled Jan 19 2016 13:36:44 +gem5 started Jan 19 2016 13:37:09 +gem5 executing on zizzer, pid 49676 +command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER + +Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +Global frequency set at 1000000000000 ticks per second +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) 
+Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +info: Entering event queue @ 0. Starting simulation... +keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23 +the gpu says: +elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe +Exiting @ tick 314399500 because target called exit() diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt new file mode 100644 index 000000000..7e23ea73c --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt @@ -0,0 +1,3201 @@ + +---------- Begin Simulation Statistics ---------- +sim_seconds 0.000314 # Number of seconds simulated +sim_ticks 314399500 # Number of ticks simulated +final_tick 314399500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset) +sim_freq 1000000000000 # Frequency of simulated ticks +host_inst_rate 59851 # Simulator instruction rate (inst/s) +host_op_rate 123077 # Simulator op (including micro ops) rate (op/s) +host_tick_rate 280996968 # Simulator tick rate (ticks/s) +host_mem_usage 1296852 # Number of bytes of host memory used +host_seconds 1.12 # Real time elapsed on the host +sim_insts 66963 # Number of instructions simulated +sim_ops 137705 # Number of ops (including micro ops) simulated +system.voltage_domain.voltage 1 # Voltage in Volts +system.clk_domain.clock 1000 # Clock period in ticks +system.mem_ctrls.bytes_read::ruby.dir_cntrl0 99840 # Number of bytes read from this memory +system.mem_ctrls.bytes_read::total 99840 # 
Number of bytes read from this memory +system.mem_ctrls.num_reads::ruby.dir_cntrl0 1560 # Number of read requests responded to by this memory +system.mem_ctrls.num_reads::total 1560 # Number of read requests responded to by this memory +system.mem_ctrls.bw_read::ruby.dir_cntrl0 317557757 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_read::total 317557757 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_total::ruby.dir_cntrl0 317557757 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.bw_total::total 317557757 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.readReqs 1560 # Number of read requests accepted +system.mem_ctrls.writeReqs 0 # Number of write requests accepted +system.mem_ctrls.readBursts 1560 # Number of DRAM read bursts, including those serviced by the write queue +system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue +system.mem_ctrls.bytesReadDRAM 99840 # Total number of bytes read from DRAM +system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue +system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM +system.mem_ctrls.bytesReadSys 99840 # Total read bytes from the system interface side +system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side +system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue +system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one +system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write +system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts 
+system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::10 182 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::12 223 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts +system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry +system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry +system.mem_ctrls.totGap 314257000 # Total gap between requests +system.mem_ctrls.readPktSize::0 0 # Read 
request sizes (log2) +system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::6 1560 # Read request sizes (log2) +system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2) +system.mem_ctrls.rdQLenPdf::0 1544 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::1 3 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::3 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::4 4 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::5 3 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::6 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::7 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::13 0 # What read queue 
length does an incoming req see +system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::5 0 # 
What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see 
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::52 0 # What 
write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see +system.mem_ctrls.bytesPerActivate::samples 398 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::mean 247.798995 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::gmean 164.777646 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::stdev 248.151006 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::0-127 138 34.67% 34.67% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::128-255 115 28.89% 63.57% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::256-383 55 13.82% 77.39% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::384-511 30 7.54% 84.92% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::512-639 19 4.77% 89.70% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::640-767 13 3.27% 92.96% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::768-895 7 1.76% 94.72% # Bytes accessed per row 
activation +system.mem_ctrls.bytesPerActivate::896-1023 7 1.76% 96.48% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::1024-1151 14 3.52% 100.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::total 398 # Bytes accessed per row activation +system.mem_ctrls.totQLat 12586250 # Total ticks spent queuing +system.mem_ctrls.totMemAccLat 41836250 # Total ticks spent from burst creation until serviced by the DRAM +system.mem_ctrls.totBusLat 7800000 # Total ticks spent in databus transfers +system.mem_ctrls.avgQLat 8068.11 # Average queueing delay per DRAM burst +system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst +system.mem_ctrls.avgMemAccLat 26818.11 # Average memory access latency per DRAM burst +system.mem_ctrls.avgRdBW 317.56 # Average DRAM read bandwidth in MiByte/s +system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s +system.mem_ctrls.avgRdBWSys 317.56 # Average system read bandwidth in MiByte/s +system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s +system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s +system.mem_ctrls.busUtil 2.48 # Data bus utilization in percentage +system.mem_ctrls.busUtilRead 2.48 # Data bus utilization in percentage for reads +system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes +system.mem_ctrls.avgRdQLen 1.04 # Average read queue length when enqueuing +system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing +system.mem_ctrls.readRowHits 1157 # Number of row buffer hits during reads +system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes +system.mem_ctrls.readRowHitRate 74.17 # Row buffer hit rate for reads +system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes +system.mem_ctrls.avgGap 201446.79 # Average gap between requests +system.mem_ctrls.pageHitRate 74.17 # Row buffer hit rate, read and write combined 
+system.mem_ctrls_0.actEnergy 1141560 # Energy for activate commands per rank (pJ) +system.mem_ctrls_0.preEnergy 622875 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ) +system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_0.refreshEnergy 20342400 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_0.actBackEnergy 179243055 # Energy for active background per rank (pJ) +system.mem_ctrls_0.preBackEnergy 29795250 # Energy for precharge background per rank (pJ) +system.mem_ctrls_0.totalEnergy 236480340 # Total energy per rank (pJ) +system.mem_ctrls_0.averagePower 758.654968 # Core power per rank (mW) +system.mem_ctrls_0.memoryStateTime::IDLE 51073000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::REF 10400000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT 252847000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states +system.mem_ctrls_1.actEnergy 1867320 # Energy for activate commands per rank (pJ) +system.mem_ctrls_1.preEnergy 1018875 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_1.readEnergy 6684600 # Energy for read commands per rank (pJ) +system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_1.refreshEnergy 20342400 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_1.actBackEnergy 198048780 # Energy for active background per rank (pJ) +system.mem_ctrls_1.preBackEnergy 13299000 # Energy for precharge background per rank (pJ) +system.mem_ctrls_1.totalEnergy 241260975 # Total energy per rank (pJ) +system.mem_ctrls_1.averagePower 773.991771 # Core power per rank (mW) +system.mem_ctrls_1.memoryStateTime::IDLE 20941500 # Time in different power states +system.mem_ctrls_1.memoryStateTime::REF 
10400000 # Time in different power states +system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT 280382250 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states +system.ruby.clk_domain.clock 500 # Clock period in ticks +system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory +system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory 
+system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory +system.ruby.phys_mem.bw_read::cpu0.inst 2216161285 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu0.data 381145644 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 10432587 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 10432587 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::total 2618172103 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu0.inst 2216161285 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 6361333 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 6361333 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::total 2228883952 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu0.data 231447569 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 814251 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 814251 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::total 233076070 # 
Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.inst 2216161285 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.data 612593213 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 11246837 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 11246837 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::total 2851248173 # Total bandwidth to/from this memory (bytes/s) +system.cpu0.clk_domain.clock 500 # Clock period in ticks +system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks +system.cpu0.workload.num_syscalls 21 # Number of system calls +system.cpu0.numCycles 628799 # number of cpu cycles simulated +system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started +system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed +system.cpu0.committedInsts 66963 # Number of instructions committed +system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed +system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses +system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses +system.cpu0.num_func_calls 3196 # number of times a function call or return occured +system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls +system.cpu0.num_int_insts 136380 # number of integer instructions +system.cpu0.num_fp_insts 1279 # number of float instructions +system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read +system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written +system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read +system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written +system.cpu0.num_cc_register_reads 78262 # 
number of times the CC registers were read +system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written +system.cpu0.num_mem_refs 27198 # number of memory refs +system.cpu0.num_load_insts 16684 # Number of load instructions +system.cpu0.num_store_insts 10514 # Number of store instructions +system.cpu0.num_idle_cycles 8671.003972 # Number of idle cycles +system.cpu0.num_busy_cycles 620127.996028 # Number of busy cycles +system.cpu0.not_idle_fraction 0.986210 # Percentage of non-idle cycles +system.cpu0.idle_fraction 0.013790 # Percentage of idle cycles +system.cpu0.Branches 16199 # Number of branches fetched +system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction +system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction +system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction +system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction +system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction 
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction +system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction +system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::total 137705 # Class of executed instruction +system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.cpu1.clk_domain.clock 1000 # Clock period in ticks +system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 216 # number of times the wf's instructions are 
blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # 
number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed 
instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 195 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N 
source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port 
availability +system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 194 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 177 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions 
you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4663 # number of cycles the CU issues nothing +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 102 # number of cycles the CU issued at least one instruction +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at 
least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1993 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 288 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 325 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 248 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 341 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 27 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.spc::samples 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::mean 0.029591 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::stdev 0.214321 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::0 4663 97.86% 97.86% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::1 65 1.36% 99.22% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::2 35 0.73% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::5 
0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::total 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 66 # number of CU transitions from active to idle +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 66 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 61.575758 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 253.572448 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 45 68.18% 68.18% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 10 15.15% 83.33% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 83.33% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.52% 84.85% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 3.03% 87.88% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.52% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 
0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.39% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.61% 100.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1685 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 66 # duration of idle periods in cycles +system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs0.tlb_cycles -212991640500 # total number of cycles for all uncoalesced requests +system.cpu1.CUs0.avg_translation_latency -276972224.317295 # Avg. 
translation latency for data translations +system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses +system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank 
conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::mean 81.602837 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::stdev 244.924445 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::4-5 57 40.43% 49.65% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::6-7 28 19.86% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::8-9 2 1.42% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::max_value 1686 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs0.num_total_cycles 4765 # number of cycles the CU ran for +system.cpu1.CUs0.vpc 1.420567 # Vector Operations per cycle (this CU only) +system.cpu1.CUs0.ipc 0.029591 # Instructions per cycle (this CU only) +system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts +system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 216 # number of times the wf's 
instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # 
number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed 
instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 195 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N 
source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port 
availability +system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 190 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 176 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions 
you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4667 # number of cycles the CU issues nothing +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 98 # number of cycles the CU issued at least one instruction +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at 
least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 2052 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 327 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 265 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 285 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 341 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 32 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.spc::samples 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::mean 0.029591 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::stdev 0.218204 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::0 4667 97.94% 97.94% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::1 57 1.20% 99.14% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::2 39 0.82% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::5 
0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::total 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 61 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 257.808908 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 49 72.06% 72.06% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 83.82% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 83.82% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.94% 86.76% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.47% 88.24% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.71% 
# duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1764 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles +system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs1.tlb_cycles -212991830500 # total number of cycles for all uncoalesced requests +system.cpu1.CUs1.avg_translation_latency -276972471.391417 # Avg. 
translation latency for data translations +system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses +system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number 
of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory 
packet +system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::mean 82.212766 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::stdev 248.914352 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::6-7 28 19.86% 66.67% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.21% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::overflows 41 29.08% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::max_value 1765 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs1.num_total_cycles 4765 # number of cycles the CU ran for +system.cpu1.CUs1.vpc 1.419098 # Vector Operations per cycle (this CU only) +system.cpu1.CUs1.ipc 0.029591 # Instructions per cycle (this CU only) +system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts +system.cpu2.num_kernel_launched 1 # number of kernel launched +system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses +system.dispatcher_coalescer.coalesced_accesses 0 # Number of 
coalesced TLB accesses +system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue +system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts +system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level +system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched +system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs +system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses +system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer0.local_latency 0 # Avg. 
latency over all incoming pkts +system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses +system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts +system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb0.unique_pages 4 # Number of unique pages touched +system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb0.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb1.unique_pages 3 # Number of unique pages touched +system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses +system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l2_coalescer.local_latency 125 # Avg. 
latency over all incoming pkts +system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses +system.l2_tlb.local_TLB_hits 3 # Number of TLB hits +system.l2_tlb.local_TLB_misses 5 # Number of TLB misses +system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate +system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses +system.l2_tlb.global_TLB_hits 3 # Number of TLB hits +system.l2_tlb.global_TLB_misses 12 # Number of TLB misses +system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate +system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level +system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.l2_tlb.unique_pages 5 # Number of unique pages touched +system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs +system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs +system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses +system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l3_coalescer.local_latency 200 # Avg. 
latency over all incoming pkts +system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses +system.l3_tlb.local_TLB_hits 0 # Number of TLB hits +system.l3_tlb.local_TLB_misses 5 # Number of TLB misses +system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses +system.l3_tlb.global_TLB_hits 0 # Number of TLB hits +system.l3_tlb.global_TLB_misses 12 # Number of TLB misses +system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level +system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table +system.l3_tlb.unique_pages 5 # Number of unique pages touched +system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs +system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs +system.l3_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.piobus.trans_dist::WriteReq 94 # Transaction distribution +system.piobus.trans_dist::WriteResp 94 # Transaction distribution +system.piobus.pkt_count_system.ruby.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_size_system.ruby.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.reqLayer0.occupancy 234500 # Layer occupancy (ticks) +system.piobus.reqLayer0.utilization 0.1 # Layer utilization (%) +system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks) +system.piobus.respLayer0.utilization 0.0 # Layer utilization (%) +system.ruby.outstanding_req_hist::bucket_size 1 +system.ruby.outstanding_req_hist::max_bucket 9 +system.ruby.outstanding_req_hist::samples 114203 +system.ruby.outstanding_req_hist::mean 1.000035 +system.ruby.outstanding_req_hist::gmean 1.000024 +system.ruby.outstanding_req_hist::stdev 0.005918 +system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.outstanding_req_hist::total 114203 +system.ruby.latency_hist::bucket_size 128 +system.ruby.latency_hist::max_bucket 1279 +system.ruby.latency_hist::samples 114203 +system.ruby.latency_hist::mean 4.423518 +system.ruby.latency_hist::gmean 1.078765 +system.ruby.latency_hist::stdev 30.010569 +system.ruby.latency_hist | 112668 98.66% 98.66% | 1136 0.99% 99.65% | 372 0.33% 99.98% | 3 0.00% 99.98% | 8 0.01% 99.99% | 14 0.01% 100.00% | 2 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.latency_hist::total 
114203 +system.ruby.hit_latency_hist::bucket_size 128 +system.ruby.hit_latency_hist::max_bucket 1279 +system.ruby.hit_latency_hist::samples 1535 +system.ruby.hit_latency_hist::mean 255.015635 +system.ruby.hit_latency_hist::gmean 251.519163 +system.ruby.hit_latency_hist::stdev 57.825523 +system.ruby.hit_latency_hist | 0 0.00% 0.00% | 1136 74.01% 74.01% | 372 24.23% 98.24% | 3 0.20% 98.44% | 8 0.52% 98.96% | 14 0.91% 99.87% | 2 0.13% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.hit_latency_hist::total 1535 +system.ruby.miss_latency_hist::bucket_size 2 +system.ruby.miss_latency_hist::max_bucket 19 +system.ruby.miss_latency_hist::samples 112668 +system.ruby.miss_latency_hist::mean 1.009426 +system.ruby.miss_latency_hist::gmean 1.001543 +system.ruby.miss_latency_hist::stdev 0.411800 +system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00% +system.ruby.miss_latency_hist::total 112668 +system.ruby.L1Cache.incomplete_times 112609 +system.ruby.L2Cache.incomplete_times 59 +system.ruby.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits +system.ruby.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses +system.ruby.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses +system.ruby.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads +system.ruby.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes +system.ruby.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads +system.ruby.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes +system.ruby.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits +system.ruby.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses +system.ruby.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand 
accesses +system.ruby.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits +system.ruby.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses +system.ruby.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses +system.ruby.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads +system.ruby.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes +system.ruby.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag array reads +system.ruby.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes +system.ruby.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.ruby.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses +system.ruby.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses +system.ruby.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads +system.ruby.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes +system.ruby.cp_cntrl0.L2cache.num_tag_array_reads 12068 # number of tag array reads +system.ruby.cp_cntrl0.L2cache.num_tag_array_writes 1658 # number of tag array writes +system.ruby.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits +system.ruby.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses +system.ruby.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses +system.ruby.dir_cntrl0.L3CacheMemory.num_data_array_writes 1560 # number of data array writes +system.ruby.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1560 # number of tag array reads +system.ruby.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1578 # number of tag array writes +system.ruby.network.ext_links0.int_node.percent_links_utilized 1.075754 +system.ruby.network.ext_links0.int_node.msg_count.Control::0 1560 +system.ruby.network.ext_links0.int_node.msg_count.Data::0 18 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 
1542 +system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1546 +system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 1558 +system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1541 +system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 12480 +system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1296 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12336 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 111312 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 12464 +system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12328 +system.ruby.network.ext_links1.int_node.percent_links_utilized 1.347807 +system.ruby.network.ext_links1.int_node.msg_count.Control::0 25 +system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535 +system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537 +system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 23 +system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1534 +system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 200 +system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 184 +system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12272 +system.ruby.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.ruby.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.ruby.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.ruby.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads +system.ruby.tcp_cntrl0.L1cache.num_data_array_writes 11 
# number of data array writes +system.ruby.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.ruby.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes +system.ruby.tcp_cntrl0.L1cache.num_tag_array_stalls 5082 # number of stalls caused by tag array +system.ruby.tcp_cntrl0.L1cache.num_data_array_stalls 6 # number of stalls caused by data array +system.ruby.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.ruby.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.ruby.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.ruby.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.ruby.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.ruby.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.ruby.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.ruby.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU +system.ruby.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.ruby.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.ruby.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.ruby.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.ruby.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.ruby.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.ruby.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.ruby.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.network.ext_links2.int_node.percent_links_utilized 0.115426 +system.ruby.network.ext_links2.int_node.msg_count.Control::0 1535 +system.ruby.network.ext_links2.int_node.msg_count.Data::0 18 +system.ruby.network.ext_links2.int_node.msg_count.Data::1 18 
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 7 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 9 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 9 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 11 +system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1535 +system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::3 16 +system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 7 +system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 12280 +system.ruby.network.ext_links2.int_node.msg_bytes.Data::0 1296 +system.ruby.network.ext_links2.int_node.msg_bytes.Data::1 1296 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 56 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 72 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 792 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 12280 +system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::3 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 56 +system.ruby.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits +system.ruby.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.ruby.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.ruby.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads +system.ruby.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes +system.ruby.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.ruby.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes 
+system.ruby.tcp_cntrl1.L1cache.num_tag_array_stalls 5082 # number of stalls caused by tag array +system.ruby.tcp_cntrl1.L1cache.num_data_array_stalls 6 # number of stalls caused by data array +system.ruby.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.ruby.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.ruby.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.ruby.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.ruby.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.ruby.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.ruby.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.ruby.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in the GPU +system.ruby.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.ruby.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.ruby.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.ruby.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.ruby.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.ruby.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.ruby.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.ruby.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.ruby.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.ruby.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.ruby.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads +system.ruby.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads +system.ruby.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes 
+system.ruby.sqc_cntrl0.sequencer.load_waiting_on_load 97 # Number of times a load aliased with a pending load +system.ruby.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.ruby.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses +system.ruby.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses +system.ruby.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes +system.ruby.tcc_cntrl0.L2cache.num_tag_array_reads 1569 # number of tag array reads +system.ruby.tcc_cntrl0.L2cache.num_tag_array_writes 1545 # number of tag array writes +system.ruby.tcc_cntrl0.L2cache.num_tag_array_stalls 1 # number of stalls caused by tag array +system.ruby.network.msg_count.Control 3120 +system.ruby.network.msg_count.Data 54 +system.ruby.network.msg_count.Request_Control 3093 +system.ruby.network.msg_count.Response_Data 3103 +system.ruby.network.msg_count.Response_Control 3116 +system.ruby.network.msg_count.Writeback_Control 48 +system.ruby.network.msg_count.Unblock_Control 3082 +system.ruby.network.msg_byte.Control 24960 +system.ruby.network.msg_byte.Data 3888 +system.ruby.network.msg_byte.Request_Control 24744 +system.ruby.network.msg_byte.Response_Data 223416 +system.ruby.network.msg_byte.Response_Control 24928 +system.ruby.network.msg_byte.Writeback_Control 384 +system.ruby.network.msg_byte.Unblock_Control 24656 +system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses +system.sqc_coalescer.coalesced_accesses 48 # Number of coalesced TLB accesses +system.sqc_coalescer.queuing_cycles 211000 # Number of cycles spent in queue +system.sqc_coalescer.local_queuing_cycles 211000 # Number of cycles spent in queue for all incoming reqs +system.sqc_coalescer.local_latency 2453.488372 # Avg. 
latency over all incoming pkts +system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks +system.sqc_tlb.local_TLB_accesses 48 # Number of TLB accesses +system.sqc_tlb.local_TLB_hits 47 # Number of TLB hits +system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses +system.sqc_tlb.local_TLB_miss_rate 2.083333 # TLB miss rate +system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses +system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits +system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses +system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate +system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level +system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.sqc_tlb.unique_pages 1 # Number of unique pages touched +system.sqc_tlb.local_cycles 48001 # Number of cycles spent in queue for all incoming reqs +system.sqc_tlb.local_latency 1000.020833 # Avg. latency over incoming coalesced reqs +system.sqc_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.766700 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 18 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1542 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 2 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 1558 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1541 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1296 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12336 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 144 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 12464 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12328 +system.ruby.network.ext_links0.int_node.throttle1.link_utilization 2.201021 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 25 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 200 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520 +system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.259542 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1535 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 9 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 12280 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links1.int_node.throttle0.link_utilization 2.201021 
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 25 +system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 200 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520 +system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.494594 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 23 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1534 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 184 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12272 +system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.005566 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 3 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.005566 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 3 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.286737 
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Control::0 1535 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Data::1 18 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::1 9 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 9 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Control::0 12280 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Data::1 1296 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::1 72 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.007156 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 5 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 360 +system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.272106 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Data::0 18 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Request_Control::0 7 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Control::2 1535 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Unblock_Control::4 7 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Data::0 1296 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Request_Control::0 56 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Control::2 12280 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Unblock_Control::4 56 +system.ruby.LD.latency_hist::bucket_size 128 +system.ruby.LD.latency_hist::max_bucket 1279 +system.ruby.LD.latency_hist::samples 16335 +system.ruby.LD.latency_hist::mean 3.784451 +system.ruby.LD.latency_hist::gmean 1.062267 
+system.ruby.LD.latency_hist::stdev 27.056562 +system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 90 0.55% 99.48% | 84 0.51% 99.99% | 0 0.00% 99.99% | 0 0.00% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.latency_hist::total 16335 +system.ruby.LD.hit_latency_hist::bucket_size 128 +system.ruby.LD.hit_latency_hist::max_bucket 1279 +system.ruby.LD.hit_latency_hist::samples 175 +system.ruby.LD.hit_latency_hist::mean 260.394286 +system.ruby.LD.hit_latency_hist::gmean 258.339713 +system.ruby.LD.hit_latency_hist::stdev 42.039376 +system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 90 51.43% 51.43% | 84 48.00% 99.43% | 0 0.00% 99.43% | 0 0.00% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.hit_latency_hist::total 175 +system.ruby.LD.miss_latency_hist::bucket_size 2 +system.ruby.LD.miss_latency_hist::max_bucket 19 +system.ruby.LD.miss_latency_hist::samples 16160 +system.ruby.LD.miss_latency_hist::mean 1.005569 +system.ruby.LD.miss_latency_hist::gmean 1.000911 +system.ruby.LD.miss_latency_hist::stdev 0.316580 +system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00% +system.ruby.LD.miss_latency_hist::total 16160 +system.ruby.ST.latency_hist::bucket_size 128 +system.ruby.ST.latency_hist::max_bucket 1279 +system.ruby.ST.latency_hist::samples 10412 +system.ruby.ST.latency_hist::mean 8.839992 +system.ruby.ST.latency_hist::gmean 1.186243 +system.ruby.ST.latency_hist::stdev 45.390081 +system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 254 2.44% 99.35% | 62 0.60% 99.94% | 0 0.00% 99.94% | 1 0.01% 99.95% | 4 0.04% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.latency_hist::total 10412 +system.ruby.ST.hit_latency_hist::bucket_size 128 
+system.ruby.ST.hit_latency_hist::max_bucket 1279 +system.ruby.ST.hit_latency_hist::samples 322 +system.ruby.ST.hit_latency_hist::mean 254.509317 +system.ruby.ST.hit_latency_hist::gmean 250.282441 +system.ruby.ST.hit_latency_hist::stdev 65.931487 +system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 254 78.88% 78.88% | 62 19.25% 98.14% | 0 0.00% 98.14% | 1 0.31% 98.45% | 4 1.24% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.hit_latency_hist::total 322 +system.ruby.ST.miss_latency_hist::bucket_size 1 +system.ruby.ST.miss_latency_hist::max_bucket 9 +system.ruby.ST.miss_latency_hist::samples 10090 +system.ruby.ST.miss_latency_hist::mean 1 +system.ruby.ST.miss_latency_hist::gmean 1 +system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.miss_latency_hist::total 10090 +system.ruby.IFETCH.latency_hist::bucket_size 128 +system.ruby.IFETCH.latency_hist::max_bucket 1279 +system.ruby.IFETCH.latency_hist::samples 87095 +system.ruby.IFETCH.latency_hist::mean 4.017395 +system.ruby.IFETCH.latency_hist::gmean 1.069735 +system.ruby.IFETCH.latency_hist::stdev 28.134930 +system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 790 0.91% 99.72% | 224 0.26% 99.98% | 3 0.00% 99.98% | 7 0.01% 99.99% | 9 0.01% 100.00% | 1 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.latency_hist::total 87095 +system.ruby.IFETCH.hit_latency_hist::bucket_size 128 +system.ruby.IFETCH.hit_latency_hist::max_bucket 1279 +system.ruby.IFETCH.hit_latency_hist::samples 1034 +system.ruby.IFETCH.hit_latency_hist::mean 254.218569 +system.ruby.IFETCH.hit_latency_hist::gmean 250.716467 +system.ruby.IFETCH.hit_latency_hist::stdev 57.514968 +system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 790 76.40% 76.40% | 224 21.66% 98.07% | 3 0.29% 98.36% | 7 0.68% 
99.03% | 9 0.87% 99.90% | 1 0.10% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.hit_latency_hist::total 1034 +system.ruby.IFETCH.miss_latency_hist::bucket_size 2 +system.ruby.IFETCH.miss_latency_hist::max_bucket 19 +system.ruby.IFETCH.miss_latency_hist::samples 86061 +system.ruby.IFETCH.miss_latency_hist::mean 1.011294 +system.ruby.IFETCH.miss_latency_hist::gmean 1.001849 +system.ruby.IFETCH.miss_latency_hist::stdev 0.450747 +system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00% +system.ruby.IFETCH.miss_latency_hist::total 86061 +system.ruby.RMW_Read.latency_hist::bucket_size 32 +system.ruby.RMW_Read.latency_hist::max_bucket 319 +system.ruby.RMW_Read.latency_hist::samples 341 +system.ruby.RMW_Read.latency_hist::mean 4.114370 +system.ruby.RMW_Read.latency_hist::gmean 1.067644 +system.ruby.RMW_Read.latency_hist::stdev 28.783090 +system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 2 0.59% 99.41% | 0 0.00% 99.41% | 2 0.59% 100.00% +system.ruby.RMW_Read.latency_hist::total 341 +system.ruby.RMW_Read.hit_latency_hist::bucket_size 32 +system.ruby.RMW_Read.hit_latency_hist::max_bucket 319 +system.ruby.RMW_Read.hit_latency_hist::samples 4 +system.ruby.RMW_Read.hit_latency_hist::mean 266.500000 +system.ruby.RMW_Read.hit_latency_hist::gmean 265.077347 +system.ruby.RMW_Read.hit_latency_hist::stdev 31.754265 +system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 2 50.00% 50.00% | 0 0.00% 50.00% | 2 50.00% 100.00% +system.ruby.RMW_Read.hit_latency_hist::total 4 +system.ruby.RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.RMW_Read.miss_latency_hist::max_bucket 9 
+system.ruby.RMW_Read.miss_latency_hist::samples 337 +system.ruby.RMW_Read.miss_latency_hist::mean 1 +system.ruby.RMW_Read.miss_latency_hist::gmean 1 +system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.miss_latency_hist::total 337 +system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.latency_hist::samples 10 +system.ruby.Locked_RMW_Read.latency_hist::mean 1 +system.ruby.Locked_RMW_Read.latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.latency_hist::total 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.miss_latency_hist::total 10 +system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.latency_hist::samples 10 +system.ruby.Locked_RMW_Write.latency_hist::mean 1 +system.ruby.Locked_RMW_Write.latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 
100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.latency_hist::total 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.miss_latency_hist::total 10 +system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1 +system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9 +system.ruby.L1Cache.miss_mach_latency_hist::samples 112609 +system.ruby.L1Cache.miss_mach_latency_hist::mean 1 +system.ruby.L1Cache.miss_mach_latency_hist::gmean 1 +system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L1Cache.miss_mach_latency_hist::total 112609 +system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2 +system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19 +system.ruby.L2Cache.miss_mach_latency_hist::samples 59 +system.ruby.L2Cache.miss_mach_latency_hist::mean 19 +system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000 +system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00% +system.ruby.L2Cache.miss_mach_latency_hist::total 59 +system.ruby.Directory.hit_mach_latency_hist::bucket_size 128 +system.ruby.Directory.hit_mach_latency_hist::max_bucket 1279 +system.ruby.Directory.hit_mach_latency_hist::samples 1535 
+system.ruby.Directory.hit_mach_latency_hist::mean 255.015635 +system.ruby.Directory.hit_mach_latency_hist::gmean 251.519163 +system.ruby.Directory.hit_mach_latency_hist::stdev 57.825523 +system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 1136 74.01% 74.01% | 372 24.23% 98.24% | 3 0.20% 98.44% | 8 0.52% 98.96% | 14 0.91% 99.87% | 2 0.13% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Directory.hit_mach_latency_hist::total 1535 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00% +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5 +system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 128 +system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 1279 +system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175 +system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 260.394286 
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 258.339713 +system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 42.039376 +system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 90 51.43% 51.43% | 84 48.00% 99.43% | 0 0.00% 99.43% | 0 0.00% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090 +system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 128 +system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 1279 +system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322 +system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 254.509317 +system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 250.282441 +system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 65.931487 +system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 254 78.88% 78.88% | 62 19.25% 98.14% | 0 0.00% 98.14% | 1 0.31% 98.45% | 4 1.24% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007 
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00% +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 128 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 1279 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 254.218569 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 250.716467 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 57.514968 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 790 76.40% 76.40% | 224 21.66% 98.07% | 3 0.29% 98.36% | 7 0.68% 99.03% | 9 0.87% 99.90% | 1 0.10% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 266.500000 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 265.077347 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 31.754265 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 2 50.00% 50.00% | 0 0.00% 50.00% | 2 50.00% 100.00% +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10 
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckE 175 0.00% 0.00% +system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvData 18 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrData 7 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbInvData 17 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbShrData 5 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 
0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00% +system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1hit 3356 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00% +system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00% +system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.NB_AckE 175 0.00% 0.00% +system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00% +system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkS 1034 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkM 326 0.00% 0.00% +system.ruby.Directory_Controller.RdBlk 182 0.00% 0.00% +system.ruby.Directory_Controller.WriteThrough 16 0.00% 0.00% +system.ruby.Directory_Controller.Atomic 3 0.00% 0.00% +system.ruby.Directory_Controller.CPUPrbResp 1560 0.00% 0.00% +system.ruby.Directory_Controller.ProbeAcksComplete 1560 0.00% 0.00% +system.ruby.Directory_Controller.MemData 1560 0.00% 0.00% +system.ruby.Directory_Controller.CoreUnblock 1541 0.00% 0.00% +system.ruby.Directory_Controller.UnblockWriteThrough 
18 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkS 1034 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkM 326 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlk 182 0.00% 0.00% +system.ruby.Directory_Controller.U.WriteThrough 16 0.00% 0.00% +system.ruby.Directory_Controller.U.Atomic 2 0.00% 0.00% +system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00% +system.ruby.Directory_Controller.BM_M.MemData 326 0.00% 0.00% +system.ruby.Directory_Controller.B_M.MemData 175 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.CPUPrbResp 1034 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 1034 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.Atomic 1 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.CPUPrbResp 326 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 326 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.MemData 18 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.CPUPrbResp 175 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 175 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.MemData 7 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 18 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 18 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.CPUPrbResp 7 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 7 0.00% 0.00% +system.ruby.Directory_Controller.B.CoreUnblock 1541 0.00% 0.00% +system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00% +system.ruby.SQC_Controller.Fetch 86 0.00% 0.00% +system.ruby.SQC_Controller.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00% +system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00% +system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00% +system.ruby.TCC_Controller.Atomic 2 0.00% 0.00% +system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00% 
+system.ruby.TCC_Controller.Data 9 0.00% 0.00% +system.ruby.TCC_Controller.PrbInv 1535 0.00% 0.00% +system.ruby.TCC_Controller.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00% +system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00% +system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 0.00% +system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.I.PrbInv 1534 0.00% 0.00% +system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00% +system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00% +system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00% +system.ruby.TCC_Controller.A.Data 2 0.00% 0.00% +system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.Load::total 10 +system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.StoreThrough::total 16 +system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.Atomic::total 2 +system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00% +system.ruby.TCP_Controller.Flush::total 1536 +system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00% +system.ruby.TCP_Controller.Evict::total 1024 +system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.TCC_Ack::total 6 +system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckWB::total 16 +system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.Load::total 4 +system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.StoreThrough::total 16 +system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.I.Atomic::total 2 +system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 
100.00% +system.ruby.TCP_Controller.I.Flush::total 1532 +system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00% +system.ruby.TCP_Controller.I.Evict::total 1020 +system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_Ack::total 4 +system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_AckWB::total 16 +system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.V.Load::total 6 +system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Flush::total 4 +system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Evict::total 4 +system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.A.TCC_Ack::total 2 + +---------- End Simulation Statistics ---------- diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini new file mode 100644 index 000000000..b3fabf81b --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini @@ -0,0 +1,4089 @@ +[root] +type=Root +children=system +eventq_index=0 +full_system=false +sim_quantum=0 +time_sync_enable=false +time_sync_period=100000000000 +time_sync_spin_threshold=100000000 + +[system] +type=System +children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain +boot_osflags=a +cache_line_size=64 +clk_domain=system.clk_domain +eventq_index=0 +exit_on_work_items=false +init_param=0 +kernel= +kernel_addr_check=true +load_addr_mask=1099511627775 +load_offset=0 
+mem_mode=timing +mem_ranges=0:536870911 +memories=system.mem_ctrls system.ruby.phys_mem +mmap_using_noreserve=false +multi_thread=false +num_work_ids=16 +readfile= +symbolfile= +work_begin_ckpt_count=0 +work_begin_cpu_id_exit=-1 +work_begin_exit_count=0 +work_cpus_ckpt_count=0 +work_end_ckpt_count=0 +work_end_exit_count=0 +work_item_id=-1 +system_port=system.sys_port_proxy.slave[0] + +[system.clk_domain] +type=SrcClockDomain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cp_cntrl0] +type=CorePair_Controller +children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore +L1D0cache=system.cp_cntrl0.L1D0cache +L1D1cache=system.cp_cntrl0.L1D1cache +L1Icache=system.cp_cntrl0.L1Icache +L2cache=system.cp_cntrl0.L2cache +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=120 +l2_hit_latency=18 +mandatoryQueue=system.cp_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToCore=system.cp_cntrl0.probeToCore +recycle_latency=10 +requestFromCore=system.cp_cntrl0.requestFromCore +responseFromCore=system.cp_cntrl0.responseFromCore +responseToCore=system.cp_cntrl0.responseToCore +ruby_system=system.ruby +send_evictions=true +sequencer=system.cp_cntrl0.sequencer +sequencer1=system.cp_cntrl0.sequencer1 +system=system +transitions_per_cycle=32 +triggerQueue=system.cp_cntrl0.triggerQueue +unblockFromCore=system.cp_cntrl0.unblockFromCore +version=0 + +[system.cp_cntrl0.L1D0cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1D0cache.replacement_policy] +type=PseudoLRUReplacementPolicy 
+assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.cp_cntrl0.L1D1cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1D1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.cp_cntrl0.L1Icache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1Icache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=32768 + +[system.cp_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.cp_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=2097152 + +[system.cp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.cp_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[3] + +[system.cp_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[2] + +[system.cp_cntrl0.responseFromCore] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[3] + +[system.cp_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[4] + +[system.cp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=0 +dcache=system.cp_cntrl0.L1D0cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=0 +master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave +mem_master_port=system.piobus.slave[0] +slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master + +[system.cp_cntrl0.sequencer1] +type=RubySequencer +clk_domain=system.clk_domain +coreid=1 +dcache=system.cp_cntrl0.L1D1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=1 + +[system.cp_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.cp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[4] + +[system.cpu0] +type=TimingSimpleCPU +children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload +branchPred=Null +checker=Null +clk_domain=system.cpu0.clk_domain +cpu_id=0 +do_checkpoint_insts=true 
+do_quiesce=true +do_statistics_insts=true +dtb=system.cpu0.dtb +eventq_index=0 +function_trace=false +function_trace_start=0 +interrupts=system.cpu0.interrupts +isa=system.cpu0.isa +itb=system.cpu0.itb +max_insts_all_threads=0 +max_insts_any_thread=0 +max_loads_all_threads=0 +max_loads_any_thread=0 +numThreads=1 +profile=0 +progress_interval=0 +simpoint_start_insts= +socket_id=0 +switched_out=false +system=system +tracer=system.cpu0.tracer +workload=system.cpu0.workload +dcache_port=system.cp_cntrl0.sequencer.slave[1] +icache_port=system.cp_cntrl0.sequencer.slave[0] + +[system.cpu0.apic_clk_domain] +type=DerivedClockDomain +clk_divider=16 +clk_domain=system.cpu0.clk_domain +eventq_index=0 + +[system.cpu0.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cpu0.dtb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.dtb.walker + +[system.cpu0.dtb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[3] + +[system.cpu0.interrupts] +type=X86LocalApic +clk_domain=system.cpu0.apic_clk_domain +eventq_index=0 +int_latency=1000 +pio_addr=2305843009213693952 +pio_latency=100000 +system=system +int_master=system.cp_cntrl0.sequencer.slave[4] +int_slave=system.cp_cntrl0.sequencer.master[1] +pio=system.cp_cntrl0.sequencer.master[0] + +[system.cpu0.isa] +type=X86ISA +eventq_index=0 + +[system.cpu0.itb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.itb.walker + +[system.cpu0.itb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[2] + +[system.cpu0.tracer] +type=ExeTracer +eventq_index=0 + +[system.cpu0.workload] +type=LiveProcess +cmd=gpu-hello +cwd= +drivers=system.cpu2.cl_driver +egid=100 +env= +errout=cerr +euid=100 
+eventq_index=0 +executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello +gid=100 +input=cin +kvmInSE=false +max_stack_size=67108864 +output=cout +pid=100 +ppid=99 +simpoint=0 +system=system +uid=100 +useArchPT=false + +[system.cpu1] +type=Shader +children=CUs0 CUs1 clk_domain +CUs=system.cpu1.CUs0 system.cpu1.CUs1 +clk_domain=system.cpu1.clk_domain +cpu_pointer=system.cpu0 +eventq_index=0 +globalmem=65536 +impl_kern_boundary_sync=true +n_wf=8 +separate_acquire_release=false +timing=true +translation=false + +[system.cpu1.CUs0] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=0 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs0.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 
system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs0.ldsBus.slave +memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] 
system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[0] +sqc_tlb_port=system.sqc_coalescer.slave[0] +translation_port=system.l1_coalescer0.slave[0] + +[system.cpu1.CUs0.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs0.localDataStore.cuPort +slave=system.cpu1.CUs0.ldsPort + +[system.cpu1.CUs0.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs0.ldsBus.master + 
+[system.cpu1.CUs0.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs0.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs0.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs0.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs0.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + 
+[system.cpu1.CUs0.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.CUs1] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 
wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=1 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs1.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs1.ldsBus.slave +memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] 
system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] 
system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] system.tcp_cntrl1.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[1] +sqc_tlb_port=system.sqc_coalescer.slave[1] +translation_port=system.l1_coalescer1.slave[0] + +[system.cpu1.CUs1.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs1.localDataStore.cuPort +slave=system.cpu1.CUs1.ldsPort + +[system.cpu1.CUs1.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs1.ldsBus.master + +[system.cpu1.CUs1.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs1.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs1.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs1.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs1.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 
+ +[system.cpu1.CUs1.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 
+wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.cpu1.clk_domain.voltage_domain + +[system.cpu1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.cpu2] +type=GpuDispatcher +children=cl_driver +cl_driver=system.cpu2.cl_driver +clk_domain=system.clk_domain +cpu=system.cpu0 +eventq_index=0 +pio_addr=8589934592 +pio_latency=1000 +shader_pointer=system.cpu1 +system=system +dma=system.piobus.slave[1] +pio=system.piobus.master[0] +translation_port=system.dispatcher_coalescer.slave[0] + +[system.cpu2.cl_driver] +type=ClDriver +codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +eventq_index=0 +filename=hsa + +[system.dir_cntrl0] +type=Directory_Controller +children=L3CacheMemory L3triggerQueue ProbeFilterMemory directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores +CAB_TCC=false +L3CacheMemory=system.dir_cntrl0.L3CacheMemory +L3triggerQueue=system.dir_cntrl0.L3triggerQueue +ProbeFilterMemory=system.dir_cntrl0.ProbeFilterMemory +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +directory=system.dir_cntrl0.directory +eventq_index=0 +inclusiveDir=true +l3_hit_latency=15 +noTCCdir=true +number_of_TBEs=2560 +probeToCore=system.dir_cntrl0.probeToCore +recycle_latency=10 +requestFromCores=system.dir_cntrl0.requestFromCores +responseFromCores=system.dir_cntrl0.responseFromCores 
+responseFromMemory=system.dir_cntrl0.responseFromMemory +responseToCore=system.dir_cntrl0.responseToCore +response_latency=30 +ruby_system=system.ruby +system=system +to_memory_controller_latency=1 +transitions_per_cycle=32 +triggerQueue=system.dir_cntrl0.triggerQueue +unblockFromCores=system.dir_cntrl0.unblockFromCores +useL3OnWT=false +version=0 +memory=system.mem_ctrls.port + +[system.dir_cntrl0.L3CacheMemory] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=20 +dataArrayBanks=16.0 +eventq_index=0 +is_icache=false +replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=16777216 +start_index_bit=6 +tagAccessLatency=15 +tagArrayBanks=16.0 + +[system.dir_cntrl0.L3CacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16777216 + +[system.dir_cntrl0.L3triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.ProbeFilterMemory] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=64 +dataAccessLatency=1 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.dir_cntrl0.ProbeFilterMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=1048576 +start_index_bit=6 +tagAccessLatency=8 +tagArrayBanks=8 + +[system.dir_cntrl0.ProbeFilterMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=1048576 + +[system.dir_cntrl0.directory] +type=RubyDirectoryMemory +eventq_index=0 +numa_high_bit=5 +size=536870912 +version=0 + +[system.dir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[0] + +[system.dir_cntrl0.requestFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[0] + 
+[system.dir_cntrl0.responseFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[1] + +[system.dir_cntrl0.responseFromMemory] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.dir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[1] + +[system.dir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.unblockFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[2] + +[system.dispatcher_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.dispatcher_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.dispatcher_tlb.slave[0] +slave=system.cpu2.translation_port + +[system.dispatcher_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain + +[system.dispatcher_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dispatcher_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.dispatcher_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[1] +slave=system.dispatcher_coalescer.master[0] + +[system.dispatcher_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain + +[system.dispatcher_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 
+voltage=1.000000 + +[system.dvfs_handler] +type=DVFSHandler +domains= +enable=false +eventq_index=0 +sys_clk_domain=system.clk_domain +transition_latency=100000000 + +[system.l1_coalescer0] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer0.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb0.slave[0] +slave=system.cpu1.CUs0.translation_port[0] + +[system.l1_coalescer0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain + +[system.l1_coalescer0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_coalescer1] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer1.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb1.slave[0] +slave=system.cpu1.CUs1.translation_port[0] + +[system.l1_coalescer1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain + +[system.l1_coalescer1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb0] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb0.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[2] +slave=system.l1_coalescer0.master[0] + +[system.l1_tlb0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb0.clk_domain.voltage_domain + +[system.l1_tlb0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb1] +type=X86GPUTLB +children=clk_domain 
+accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb1.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[3] +slave=system.l1_coalescer1.master[0] + +[system.l1_tlb1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb1.clk_domain.voltage_domain + +[system.l1_tlb1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l2_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l2_tlb.slave[0] +slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0] + +[system.l2_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_coalescer.clk_domain.voltage_domain + +[system.l2_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l2_tlb.clk_domain +eventq_index=0 +hitLatency=69 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=4096 +master=system.l3_coalescer.slave[0] +slave=system.l2_coalescer.master[0] + +[system.l2_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_tlb.clk_domain.voltage_domain + +[system.l2_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l3_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 
+probesPerCycle=2 +master=system.l3_tlb.slave[0] +slave=system.l2_tlb.master[0] + +[system.l3_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_coalescer.clk_domain.voltage_domain + +[system.l3_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l3_tlb.clk_domain +eventq_index=0 +hitLatency=150 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=8192 +slave=system.l3_coalescer.master[0] + +[system.l3_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_tlb.clk_domain.voltage_domain + +[system.l3_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.mem_ctrls] +type=DRAMCtrl +IDD0=0.075000 +IDD02=0.000000 +IDD2N=0.050000 +IDD2N2=0.000000 +IDD2P0=0.000000 +IDD2P02=0.000000 +IDD2P1=0.000000 +IDD2P12=0.000000 +IDD3N=0.057000 +IDD3N2=0.000000 +IDD3P0=0.000000 +IDD3P02=0.000000 +IDD3P1=0.000000 +IDD3P12=0.000000 +IDD4R=0.187000 +IDD4R2=0.000000 +IDD4W=0.165000 +IDD4W2=0.000000 +IDD5=0.220000 +IDD52=0.000000 +IDD6=0.000000 +IDD62=0.000000 +VDD=1.500000 +VDD2=0.000000 +activation_limit=4 +addr_mapping=RoRaBaCoCh +bank_groups_per_rank=0 +banks_per_rank=8 +burst_length=8 +channels=1 +clk_domain=system.clk_domain +conf_table_reported=true +device_bus_width=8 +device_rowbuffer_size=1024 +device_size=536870912 +devices_per_rank=8 +dll=true +eventq_index=0 +in_addr_map=true +max_accesses_per_row=16 +mem_sched_policy=frfcfs +min_writes_per_switch=16 +null=false +page_policy=open_adaptive +range=0:536870911 +ranks_per_channel=2 +read_buffer_size=32 +static_backend_latency=10000 +static_frontend_latency=10000 +tBURST=5000 +tCCD_L=0 +tCK=1250 +tCL=13750 +tCS=2500 
+tRAS=35000 +tRCD=13750 +tREFI=7800000 +tRFC=260000 +tRP=13750 +tRRD=6000 +tRRD_L=0 +tRTP=7500 +tRTW=2500 +tWR=15000 +tWTR=7500 +tXAW=30000 +tXP=0 +tXPDLL=0 +tXS=0 +tXSDLL=0 +write_buffer_size=64 +write_high_thresh_perc=85 +write_low_thresh_perc=50 +port=system.dir_cntrl0.memory + +[system.piobus] +type=NoncoherentXBar +clk_domain=system.clk_domain +eventq_index=0 +forward_latency=0 +frontend_latency=0 +response_latency=0 +use_default_range=false +width=32 +master=system.cpu2.pio +slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma + +[system.ruby] +type=RubySystem +children=clk_domain network phys_mem +access_backing_store=true +all_instructions=false +block_size_bytes=64 +clk_domain=system.ruby.clk_domain +eventq_index=0 +hot_lines=false +memory_size_bits=48 +num_of_sequencers=5 +number_of_virtual_networks=10 +phys_mem=system.ruby.phys_mem +randomization=false + +[system.ruby.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.ruby.network] +type=SimpleNetwork +children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1 +adaptive_routing=false +buffer_size=0 
+clk_domain=system.ruby.clk_domain +control_msg_size=8 +endpoint_bandwidth=1000 +eventq_index=0 +ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 +int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39 +int_links=system.ruby.network.int_links0 system.ruby.network.int_links1 +netifs= +number_of_virtual_networks=10 
+routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node +ruby_system=system.ruby +topology=Crossbar +master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.requestFromTCP.slave system.tcc_cntrl0.probeFromNB.slave system.tcc_cntrl0.responseFromNB.slave +slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.tcc_cntrl0.responseToCore.master system.tcc_cntrl0.requestToNB.master system.tcc_cntrl0.responseToNB.master system.tcc_cntrl0.unblockToNB.master + +[system.ruby.network.ext_links0] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.dir_cntrl0 +int_node=system.ruby.network.ext_links0.int_node +latency=1 +link_id=0 +weight=1 + +[system.ruby.network.ext_links0.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 
port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 
system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 system.ruby.network.ext_links0.int_node.port_buffers35 system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 
system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 system.ruby.network.ext_links0.int_node.port_buffers71 system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79 +router_id=0 +virt_nets=10 + +[system.ruby.network.ext_links0.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.cp_cntrl0 +int_node=system.ruby.network.ext_links1.int_node +latency=1 +link_id=1 +weight=1 + +[system.ruby.network.ext_links1.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 
port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 
system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 
system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69 +router_id=1 +virt_nets=10 + +[system.ruby.network.ext_links1.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links1.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcp_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=2 +weight=1 + +[system.ruby.network.ext_links2.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 
port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 
system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 
system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 +router_id=2 +virt_nets=10 + +[system.ruby.network.ext_links2.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false 
+ +[system.ruby.network.ext_links2.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links3] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcp_cntrl1 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=3 +weight=1 + +[system.ruby.network.ext_links4] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.sqc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=4 +weight=1 + +[system.ruby.network.ext_links5] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcc_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=5 +weight=1 + +[system.ruby.network.int_link_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.int_link_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers21] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers36] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_links0] +type=SimpleIntLink +bandwidth_factor=32 +eventq_index=0 +latency=1 +link_id=0 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links1.int_node +weight=1 + +[system.ruby.network.int_links1] +type=SimpleIntLink +bandwidth_factor=32 +eventq_index=0 +latency=1 +link_id=1 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links2.int_node +weight=1 + +[system.ruby.phys_mem] +type=SimpleMemory +bandwidth=73.000000 +clk_domain=system.ruby.clk_domain +conf_table_reported=true +eventq_index=0 +in_addr_map=false +latency=30000 +latency_var=0 +null=false +range=0:536870911 + +[system.sqc_cntrl0] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer +L1cache=system.sqc_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 +mandatoryQueue=system.sqc_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.sqc_cntrl0.probeToSQC +recycle_latency=10 +requestFromSQC=system.sqc_cntrl0.requestFromSQC +responseToSQC=system.sqc_cntrl0.responseToSQC +ruby_system=system.ruby +sequencer=system.sqc_cntrl0.sequencer +system=system +transitions_per_cycle=32 +version=0 + +[system.sqc_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=8 +eventq_index=0 +is_icache=false +replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy 
+resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=8 + +[system.sqc_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.sqc_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.sqc_cntrl0.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[9] + +[system.sqc_cntrl0.requestFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[11] + +[system.sqc_cntrl0.responseToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[10] + +[system.sqc_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.sqc_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.sqc_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=6 +slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port + +[system.sqc_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.sqc_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.sqc_tlb.slave[0] +slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port + +[system.sqc_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain + +[system.sqc_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + 
+[system.sqc_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.sqc_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[0] +slave=system.sqc_coalescer.master[0] + +[system.sqc_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_tlb.clk_domain.voltage_domain + +[system.sqc_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sys_port_proxy] +type=RubyPortProxy +clk_domain=system.clk_domain +eventq_index=0 +is_cpu_sequencer=true +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_ruby_tester=false +version=0 +slave=system.system_port + +[system.tcc_cntrl0] +type=TCC_Controller +children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB +L2cache=system.tcc_cntrl0.L2cache +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +l2_request_latency=120 +l2_response_latency=16 +number_of_TBEs=5120 +probeFromNB=system.tcc_cntrl0.probeFromNB +recycle_latency=10 +requestFromTCP=system.tcc_cntrl0.requestFromTCP +requestToNB=system.tcc_cntrl0.requestToNB +responseFromNB=system.tcc_cntrl0.responseFromNB +responseToCore=system.tcc_cntrl0.responseToCore +responseToNB=system.tcc_cntrl0.responseToNB +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +triggerQueue=system.tcc_cntrl0.triggerQueue +unblockToNB=system.tcc_cntrl0.unblockToNB +version=0 + +[system.tcc_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=8 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy +resourceStalls=true 
+ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=2 +tagArrayBanks=256 + +[system.tcc_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=2097152 + +[system.tcc_cntrl0.probeFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[12] + +[system.tcc_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[11] + +[system.tcc_cntrl0.requestToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[13] + +[system.tcc_cntrl0.responseFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[13] + +[system.tcc_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[12] + +[system.tcc_cntrl0.responseToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[14] + +[system.tcc_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.tcc_cntrl0.unblockToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[15] + +[system.tcp_cntrl0] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl0.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl0.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl0.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl0.probeToTCP 
+recycle_latency=10 +requestFromTCP=system.tcp_cntrl0.requestFromTCP +responseFromTCP=system.tcp_cntrl0.responseFromTCP +responseToTCP=system.tcp_cntrl0.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl0.unblockFromCore +use_seq_not_coal=false +version=0 + +[system.tcp_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.tcp_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl0.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=2 +slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] 
system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63] + +[system.tcp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl0.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[5] + +[system.tcp_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[5] + +[system.tcp_cntrl0.responseFromTCP] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[6] + +[system.tcp_cntrl0.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[6] + +[system.tcp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=3 + +[system.tcp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[7] + +[system.tcp_cntrl1] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl1.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl1.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl1.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl1.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl1.requestFromTCP +responseFromTCP=system.tcp_cntrl1.responseFromTCP +responseToTCP=system.tcp_cntrl1.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl1.unblockFromCore +use_seq_not_coal=false +version=1 + +[system.tcp_cntrl1.L1cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false 
+replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.tcp_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl1.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=4 +slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] 
system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63] + +[system.tcp_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl1.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[7] + +[system.tcp_cntrl1.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[8] + +[system.tcp_cntrl1.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[9] + +[system.tcp_cntrl1.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[8] + +[system.tcp_cntrl1.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache 
+icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=5 + +[system.tcp_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[10] + +[system.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr new file mode 100755 index 000000000..1e2b8911e --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr @@ -0,0 +1,5 @@ +warn: system.ruby.network adopting orphan SimObject param 'int_links' +warn: system.ruby.network adopting orphan SimObject param 'ext_links' +warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes) +warn: Sockets disabled, not accepting gdb connections +warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files! diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout new file mode 100755 index 000000000..8e68d38e1 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout @@ -0,0 +1,21 @@ +gem5 Simulator System. http://gem5.org +gem5 is copyrighted software; use the --copyright option for details. 
+ +gem5 compiled Jan 19 2016 13:39:50 +gem5 started Jan 19 2016 13:40:22 +gem5 executing on zizzer, pid 50252 +command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Baseline -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Baseline + +Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +Global frequency set at 1000000000000 ticks per second +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +info: Entering event queue @ 0. Starting simulation... +keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23 +the gpu says: +elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe +Exiting @ tick 548459500 because target called exit() diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt new file mode 100644 index 000000000..281a367a9 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt @@ -0,0 +1,3200 @@ + +---------- Begin Simulation Statistics ---------- +sim_seconds 0.000548 # Number of seconds simulated +sim_ticks 548459500 # Number of ticks simulated +final_tick 548459500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset) +sim_freq 1000000000000 # 
Frequency of simulated ticks +host_inst_rate 76623 # Simulator instruction rate (inst/s) +host_op_rate 157567 # Simulator op (including micro ops) rate (op/s) +host_tick_rate 627550839 # Simulator tick rate (ticks/s) +host_mem_usage 1298164 # Number of bytes of host memory used +host_seconds 0.87 # Real time elapsed on the host +sim_insts 66963 # Number of instructions simulated +sim_ops 137705 # Number of ops (including micro ops) simulated +system.voltage_domain.voltage 1 # Voltage in Volts +system.clk_domain.clock 1000 # Clock period in ticks +system.mem_ctrls.bytes_read::dir_cntrl0 99840 # Number of bytes read from this memory +system.mem_ctrls.bytes_read::total 99840 # Number of bytes read from this memory +system.mem_ctrls.num_reads::dir_cntrl0 1560 # Number of read requests responded to by this memory +system.mem_ctrls.num_reads::total 1560 # Number of read requests responded to by this memory +system.mem_ctrls.bw_read::dir_cntrl0 182037142 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_read::total 182037142 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_total::dir_cntrl0 182037142 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.bw_total::total 182037142 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.readReqs 1560 # Number of read requests accepted +system.mem_ctrls.writeReqs 0 # Number of write requests accepted +system.mem_ctrls.readBursts 1560 # Number of DRAM read bursts, including those serviced by the write queue +system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue +system.mem_ctrls.bytesReadDRAM 99840 # Total number of bytes read from DRAM +system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue +system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM +system.mem_ctrls.bytesReadSys 99840 # Total read bytes from the system interface side +system.mem_ctrls.bytesWrittenSys 0 # Total 
written bytes from the system interface side +system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue +system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one +system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write +system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::10 182 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::12 223 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts 
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts +system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry +system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry +system.mem_ctrls.totGap 548231000 # Total gap between requests +system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::6 1560 # Read request sizes (log2) +system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2) +system.mem_ctrls.rdQLenPdf::0 1545 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::1 3 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::3 4 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::4 5 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::5 1 # What read queue length does an 
incoming req see +system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::29 0 # What read queue 
length does an incoming req see +system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see 
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::44 0 # What 
write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see +system.mem_ctrls.bytesPerActivate::samples 467 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::mean 212.008565 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::gmean 148.026325 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::stdev 209.604491 # Bytes accessed per row 
activation +system.mem_ctrls.bytesPerActivate::0-127 171 36.62% 36.62% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::128-255 154 32.98% 69.59% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::256-383 64 13.70% 83.30% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::384-511 31 6.64% 89.94% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::512-639 16 3.43% 93.36% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::640-767 12 2.57% 95.93% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::768-895 7 1.50% 97.43% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::896-1023 3 0.64% 98.07% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::1024-1151 9 1.93% 100.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::total 467 # Bytes accessed per row activation +system.mem_ctrls.totQLat 15697750 # Total ticks spent queuing +system.mem_ctrls.totMemAccLat 44947750 # Total ticks spent from burst creation until serviced by the DRAM +system.mem_ctrls.totBusLat 7800000 # Total ticks spent in databus transfers +system.mem_ctrls.avgQLat 10062.66 # Average queueing delay per DRAM burst +system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst +system.mem_ctrls.avgMemAccLat 28812.66 # Average memory access latency per DRAM burst +system.mem_ctrls.avgRdBW 182.04 # Average DRAM read bandwidth in MiByte/s +system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s +system.mem_ctrls.avgRdBWSys 182.04 # Average system read bandwidth in MiByte/s +system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s +system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s +system.mem_ctrls.busUtil 1.42 # Data bus utilization in percentage +system.mem_ctrls.busUtilRead 1.42 # Data bus utilization in percentage for reads 
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes +system.mem_ctrls.avgRdQLen 1.01 # Average read queue length when enqueuing +system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing +system.mem_ctrls.readRowHits 1088 # Number of row buffer hits during reads +system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes +system.mem_ctrls.readRowHitRate 69.74 # Row buffer hit rate for reads +system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes +system.mem_ctrls.avgGap 351430.13 # Average gap between requests +system.mem_ctrls.pageHitRate 69.74 # Row buffer hit rate, read and write combined +system.mem_ctrls_0.actEnergy 1323000 # Energy for activate commands per rank (pJ) +system.mem_ctrls_0.preEnergy 721875 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ) +system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_0.refreshEnergy 35599200 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_0.actBackEnergy 300176820 # Energy for active background per rank (pJ) +system.mem_ctrls_0.preBackEnergy 63865500 # Energy for precharge background per rank (pJ) +system.mem_ctrls_0.totalEnergy 407021595 # Total energy per rank (pJ) +system.mem_ctrls_0.averagePower 746.421165 # Core power per rank (mW) +system.mem_ctrls_0.memoryStateTime::IDLE 107390750 # Time in different power states +system.mem_ctrls_0.memoryStateTime::REF 18200000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT 422764250 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states +system.mem_ctrls_1.actEnergy 2207520 # Energy for activate commands per rank (pJ) +system.mem_ctrls_1.preEnergy 1204500 # Energy for precharge commands per rank (pJ) 
+system.mem_ctrls_1.readEnergy 6731400 # Energy for read commands per rank (pJ) +system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_1.refreshEnergy 35599200 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_1.actBackEnergy 328972365 # Energy for active background per rank (pJ) +system.mem_ctrls_1.preBackEnergy 38606250 # Energy for precharge background per rank (pJ) +system.mem_ctrls_1.totalEnergy 413321235 # Total energy per rank (pJ) +system.mem_ctrls_1.averagePower 757.973831 # Core power per rank (mW) +system.mem_ctrls_1.memoryStateTime::IDLE 62414250 # Time in different power states +system.mem_ctrls_1.memoryStateTime::REF 18200000 # Time in different power states +system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT 464697000 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states +system.ruby.clk_domain.clock 500 # Clock period in ticks +system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory 
+system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory +system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory +system.ruby.phys_mem.bw_read::cpu0.inst 1270394623 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu0.data 218488330 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 5980387 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 5980387 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::total 1500843727 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu0.inst 1270394623 # Instruction read bandwidth from this memory (bytes/s) 
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 3646577 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 3646577 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::total 1277687778 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu0.data 132675248 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 466762 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 466762 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::total 133608771 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.inst 1270394623 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.data 351163577 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 6447149 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 6447149 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::total 1634452498 # Total bandwidth to/from this memory (bytes/s) +system.ruby.outstanding_req_hist::bucket_size 1 +system.ruby.outstanding_req_hist::max_bucket 9 +system.ruby.outstanding_req_hist::samples 114203 +system.ruby.outstanding_req_hist::mean 1.000035 +system.ruby.outstanding_req_hist::gmean 1.000024 +system.ruby.outstanding_req_hist::stdev 0.005918 +system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.outstanding_req_hist::total 114203 +system.ruby.latency_hist::bucket_size 64 +system.ruby.latency_hist::max_bucket 639 +system.ruby.latency_hist::samples 114203 +system.ruby.latency_hist::mean 
3.766924 +system.ruby.latency_hist::gmean 1.075767 +system.ruby.latency_hist::stdev 23.927354 +system.ruby.latency_hist | 112668 98.66% 98.66% | 0 0.00% 98.66% | 0 0.00% 98.66% | 1489 1.30% 99.96% | 10 0.01% 99.97% | 13 0.01% 99.98% | 16 0.01% 99.99% | 7 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.latency_hist::total 114203 +system.ruby.hit_latency_hist::bucket_size 64 +system.ruby.hit_latency_hist::max_bucket 639 +system.ruby.hit_latency_hist::samples 1535 +system.ruby.hit_latency_hist::mean 206.165472 +system.ruby.hit_latency_hist::gmean 204.491657 +system.ruby.hit_latency_hist::stdev 32.551053 +system.ruby.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1489 97.00% 97.00% | 10 0.65% 97.65% | 13 0.85% 98.50% | 16 1.04% 99.54% | 7 0.46% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.hit_latency_hist::total 1535 +system.ruby.miss_latency_hist::bucket_size 2 +system.ruby.miss_latency_hist::max_bucket 19 +system.ruby.miss_latency_hist::samples 112668 +system.ruby.miss_latency_hist::mean 1.009426 +system.ruby.miss_latency_hist::gmean 1.001543 +system.ruby.miss_latency_hist::stdev 0.411800 +system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00% +system.ruby.miss_latency_hist::total 112668 +system.ruby.L1Cache.incomplete_times 112609 +system.ruby.L2Cache.incomplete_times 59 +system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses +system.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses +system.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads +system.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes +system.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads 
+system.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes +system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses +system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses +system.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads +system.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes +system.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag array reads +system.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes +system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses +system.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses +system.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads +system.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes +system.cp_cntrl0.L2cache.num_tag_array_reads 12046 # number of tag array reads +system.cp_cntrl0.L2cache.num_tag_array_writes 1641 # number of tag array writes +system.cpu0.clk_domain.clock 500 # Clock period in ticks +system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks +system.cpu0.workload.num_syscalls 21 # Number of system calls +system.cpu0.numCycles 1096919 # number of cpu cycles simulated +system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started +system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed +system.cpu0.committedInsts 66963 # Number of instructions committed +system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed 
+system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses +system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses +system.cpu0.num_func_calls 3196 # number of times a function call or return occured +system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls +system.cpu0.num_int_insts 136380 # number of integer instructions +system.cpu0.num_fp_insts 1279 # number of float instructions +system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read +system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written +system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read +system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written +system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read +system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written +system.cpu0.num_mem_refs 27198 # number of memory refs +system.cpu0.num_load_insts 16684 # Number of load instructions +system.cpu0.num_store_insts 10514 # Number of store instructions +system.cpu0.num_idle_cycles 7577.003986 # Number of idle cycles +system.cpu0.num_busy_cycles 1089341.996014 # Number of busy cycles +system.cpu0.not_idle_fraction 0.993092 # Percentage of non-idle cycles +system.cpu0.idle_fraction 0.006908 # Percentage of idle cycles +system.cpu0.Branches 16199 # Number of branches fetched +system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction +system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction +system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction +system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction +system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed 
instruction +system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::MemRead 16684 
12.12% 92.36% # Class of executed instruction +system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction +system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::total 137705 # Class of executed instruction +system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.cpu1.clk_domain.clock 1000 # Clock period in ticks +system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 372 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 353 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number 
of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW 
dependencies +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 344 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of 
executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 329 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number 
of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4357 # number of cycles the CU issues nothing +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 133 # number of cycles the CU issued at least one instruction +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1547 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 483 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 439 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 403 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 436 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued 
+system.cpu1.CUs0.ExecStage.spc::samples 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::mean 0.031403 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::stdev 0.185563 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::0 4357 97.04% 97.04% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::1 126 2.81% 99.84% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::2 6 0.13% 99.98% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::3 1 0.02% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::total 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 59.558824 # duration of idle periods in cycles 
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 213.072854 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 48 70.59% 70.59% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 82.35% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.47% 83.82% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.47% 85.29% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.94% 88.24% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles 
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1300 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles +system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs0.tlb_cycles -373675448000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs0.avg_translation_latency -485923859.557867 # Avg. translation latency for data translations +system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses +system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::38-39 0 
0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::mean 94.900709 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::stdev 247.493154 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::6-7 31 21.99% 68.79% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::8-9 3 2.13% 70.92% # Instruction Execution Rate: Number of executed vector 
instructions per cycle +system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::max_value 1303 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs0.num_total_cycles 4490 # number of cycles the CU ran for +system.cpu1.CUs0.vpc 1.507572 # Vector Operations per cycle (this CU only) +system.cpu1.CUs0.ipc 0.031403 # Instructions per cycle (this CU only) +system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions) 
+system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) 
+system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active 
lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of 
active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations 
+system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts +system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 377 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies 
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 355 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of 
executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register 
operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 352 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number 
of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 337 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 
34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many 
instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you 
got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4359 # number of cycles the CU issues nothing +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 131 # number of cycles the 
CU issued at least one instruction +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1552 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 447 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 464 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 464 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 426 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 33 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.spc::samples 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::mean 0.031403 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::stdev 0.189130 # Execution units active per cycle (Exec unit=SIMD,MemPipe) 
+system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::0 4359 97.08% 97.08% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::1 123 2.74% 99.82% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::2 6 0.13% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::total 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 74 # number of CU transitions from active to idle +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 74 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 55.324324 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 207.911408 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 56 75.68% 75.68% # duration of idle periods in cycles 
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 7 9.46% 85.14% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 85.14% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.70% 87.84% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.35% 89.19% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.35% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 9.46% 100.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1304 # duration of idle periods in cycles 
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 74 # duration of idle periods in cycles +system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs1.tlb_cycles -373672588000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs1.avg_translation_latency -485920140.442133 # Avg. translation latency for data translations +system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses +system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::mean 95.106383 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::stdev 249.293307 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::6-7 29 20.57% 67.38% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle 
+system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::max_value 1307 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs1.num_total_cycles 4490 # number of cycles the CU ran for +system.cpu1.CUs1.vpc 1.506013 # Vector Operations per cycle (this CU only) +system.cpu1.CUs1.ipc 0.031403 # Instructions per cycle (this CU only) +system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active 
per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction 
+system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 
100.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 
0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts +system.cpu2.num_kernel_launched 1 # number of kernel launched +system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits +system.dir_cntrl0.L3CacheMemory.demand_misses 0 
# Number of cache demand misses +system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses +system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1560 # number of data array writes +system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1560 # number of tag array reads +system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1578 # number of tag array writes +system.dir_cntrl0.ProbeFilterMemory.demand_hits 0 # Number of cache demand hits +system.dir_cntrl0.ProbeFilterMemory.demand_misses 0 # Number of cache demand misses +system.dir_cntrl0.ProbeFilterMemory.demand_accesses 0 # Number of cache demand accesses +system.dir_cntrl0.ProbeFilterMemory.num_tag_array_reads 1560 # number of tag array reads +system.dir_cntrl0.ProbeFilterMemory.num_tag_array_writes 1560 # number of tag array writes +system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses +system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses +system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue +system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_coalescer.local_latency nan # Avg. 
latency over all incoming pkts +system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level +system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched +system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs +system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses +system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer0.local_latency 0 # Avg. 
latency over all incoming pkts +system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses +system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts +system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb0.unique_pages 4 # Number of unique pages touched +system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb0.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb1.unique_pages 3 # Number of unique pages touched +system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses +system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l2_coalescer.local_latency 125 # Avg. 
latency over all incoming pkts +system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses +system.l2_tlb.local_TLB_hits 3 # Number of TLB hits +system.l2_tlb.local_TLB_misses 5 # Number of TLB misses +system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate +system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses +system.l2_tlb.global_TLB_hits 3 # Number of TLB hits +system.l2_tlb.global_TLB_misses 12 # Number of TLB misses +system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate +system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level +system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.l2_tlb.unique_pages 5 # Number of unique pages touched +system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs +system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs +system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses +system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l3_coalescer.local_latency 200 # Avg. 
latency over all incoming pkts +system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses +system.l3_tlb.local_TLB_hits 0 # Number of TLB hits +system.l3_tlb.local_TLB_misses 5 # Number of TLB misses +system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses +system.l3_tlb.global_TLB_hits 0 # Number of TLB hits +system.l3_tlb.global_TLB_misses 12 # Number of TLB misses +system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level +system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table +system.l3_tlb.unique_pages 5 # Number of unique pages touched +system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs +system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs +system.l3_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.piobus.trans_dist::WriteReq 94 # Transaction distribution +system.piobus.trans_dist::WriteResp 94 # Transaction distribution +system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks) +system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%) +system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks) +system.piobus.respLayer0.utilization 0.0 # Layer utilization (%) +system.ruby.network.ext_links0.int_node.percent_links_utilized 0.130525 +system.ruby.network.ext_links0.int_node.msg_count.Control::0 4 +system.ruby.network.ext_links0.int_node.msg_count.Data::0 18 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1542 +system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1546 +system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 2 +system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1541 +system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 32 +system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1296 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12336 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 111312 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 16 +system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 
12328 +system.ruby.network.ext_links1.int_node.percent_links_utilized 0.192653 +system.ruby.network.ext_links1.int_node.msg_count.Control::0 3 +system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535 +system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537 +system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 1 +system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1534 +system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 24 +system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664 +system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 8 +system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12272 +system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads +system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes +system.tcp_cntrl0.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array +system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers 
+system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.network.ext_links2.int_node.percent_links_utilized 0.002557 +system.ruby.network.ext_links2.int_node.msg_count.Control::0 1 +system.ruby.network.ext_links2.int_node.msg_count.Data::0 18 +system.ruby.network.ext_links2.int_node.msg_count.Data::1 18 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 7 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 9 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 9 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 11 +system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1 +system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::3 16 +system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 7 +system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 8 +system.ruby.network.ext_links2.int_node.msg_bytes.Data::0 1296 +system.ruby.network.ext_links2.int_node.msg_bytes.Data::1 1296 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 56 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 72 
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 792 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 8 +system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::3 128 +system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 56 +system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads +system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes +system.tcp_cntrl1.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array +system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit 
in the TCC +system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads +system.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads +system.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes +system.sqc_cntrl0.sequencer.load_waiting_on_load 98 # Number of times a load aliased with a pending load +system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses +system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses +system.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes +system.tcc_cntrl0.L2cache.num_tag_array_reads 35 # number of tag array reads +system.tcc_cntrl0.L2cache.num_tag_array_writes 11 # number of tag array writes +system.ruby.network.msg_count.Control 8 +system.ruby.network.msg_count.Data 54 +system.ruby.network.msg_count.Request_Control 3093 +system.ruby.network.msg_count.Response_Data 3103 +system.ruby.network.msg_count.Response_Control 4 +system.ruby.network.msg_count.Writeback_Control 48 +system.ruby.network.msg_count.Unblock_Control 3082 +system.ruby.network.msg_byte.Control 64 +system.ruby.network.msg_byte.Data 3888 +system.ruby.network.msg_byte.Request_Control 24744 +system.ruby.network.msg_byte.Response_Data 223416 +system.ruby.network.msg_byte.Response_Control 32 
+system.ruby.network.msg_byte.Writeback_Control 384 +system.ruby.network.msg_byte.Unblock_Control 24656 +system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses +system.sqc_coalescer.coalesced_accesses 66 # Number of coalesced TLB accesses +system.sqc_coalescer.queuing_cycles 288000 # Number of cycles spent in queue +system.sqc_coalescer.local_queuing_cycles 288000 # Number of cycles spent in queue for all incoming reqs +system.sqc_coalescer.local_latency 3348.837209 # Avg. latency over all incoming pkts +system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks +system.sqc_tlb.local_TLB_accesses 66 # Number of TLB accesses +system.sqc_tlb.local_TLB_hits 65 # Number of TLB hits +system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses +system.sqc_tlb.local_TLB_miss_rate 1.515152 # TLB miss rate +system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses +system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits +system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses +system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate +system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level +system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.sqc_tlb.unique_pages 1 # Number of unique pages touched +system.sqc_tlb.local_cycles 66001 # Number of cycles spent in queue for all incoming reqs +system.sqc_tlb.local_latency 1000.015152 # Avg. latency over incoming coalesced reqs +system.sqc_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.074413 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 18 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1542 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 2 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 2 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1541 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1296 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12336 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 144 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 16 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12328 +system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.314928 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 3 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 24 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520 +system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.002234 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 9 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 8 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links1.int_node.throttle0.link_utilization 0.314928 
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 3 +system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 24 +system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520 +system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.070379 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 1 +system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1534 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 8 +system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12272 +system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.000798 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 3 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.000798 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 3 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.006131 
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Control::0 1 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Data::1 18 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::1 9 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 9 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Control::0 8 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Data::1 1296 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::1 72 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.001026 +system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 5 +system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 360 +system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.004034 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Data::0 18 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Request_Control::0 7 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Control::2 1 +system.ruby.network.ext_links2.int_node.throttle4.msg_count.Unblock_Control::4 7 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Data::0 1296 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Request_Control::0 56 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Control::2 8 +system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Unblock_Control::4 56 +system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 
0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckE 175 0.00% 0.00% +system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrData 2 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00% +system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1hit 3356 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00% +system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00% +system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00% +system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.NB_AckE 175 
0.00% 0.00% +system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00% +system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00% +system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkS 1034 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkM 326 0.00% 0.00% +system.ruby.Directory_Controller.RdBlk 182 0.00% 0.00% +system.ruby.Directory_Controller.WriteThrough 16 0.00% 0.00% +system.ruby.Directory_Controller.Atomic 3 0.00% 0.00% +system.ruby.Directory_Controller.CPUPrbResp 4 0.00% 0.00% +system.ruby.Directory_Controller.ProbeAcksComplete 1560 0.00% 0.00% +system.ruby.Directory_Controller.MemData 1560 0.00% 0.00% +system.ruby.Directory_Controller.CoreUnblock 1541 0.00% 0.00% +system.ruby.Directory_Controller.UnblockWriteThrough 18 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkS 1034 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkM 326 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlk 182 0.00% 0.00% +system.ruby.Directory_Controller.U.WriteThrough 16 0.00% 0.00% +system.ruby.Directory_Controller.U.Atomic 2 0.00% 0.00% +system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00% +system.ruby.Directory_Controller.BM_M.MemData 343 0.00% 0.00% +system.ruby.Directory_Controller.B_M.MemData 180 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 1034 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.Atomic 1 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.CPUPrbResp 1 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 343 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.MemData 1 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 180 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.MemData 2 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 1 0.00% 0.00% 
+system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.CPUPrbResp 2 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 2 0.00% 0.00% +system.ruby.Directory_Controller.B.CoreUnblock 1541 0.00% 0.00% +system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00% +system.ruby.LD.latency_hist::bucket_size 64 +system.ruby.LD.latency_hist::max_bucket 639 +system.ruby.LD.latency_hist::samples 16335 +system.ruby.LD.latency_hist::mean 3.253444 +system.ruby.LD.latency_hist::gmean 1.059859 +system.ruby.LD.latency_hist::stdev 21.887471 +system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 170 1.04% 99.97% | 1 0.01% 99.98% | 1 0.01% 99.98% | 2 0.01% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.latency_hist::total 16335 +system.ruby.LD.hit_latency_hist::bucket_size 64 +system.ruby.LD.hit_latency_hist::max_bucket 639 +system.ruby.LD.hit_latency_hist::samples 175 +system.ruby.LD.hit_latency_hist::mean 210.828571 +system.ruby.LD.hit_latency_hist::gmean 209.031405 +system.ruby.LD.hit_latency_hist::stdev 34.022715 +system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 170 97.14% 97.14% | 1 0.57% 97.71% | 1 0.57% 98.29% | 2 1.14% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.hit_latency_hist::total 175 +system.ruby.LD.miss_latency_hist::bucket_size 2 +system.ruby.LD.miss_latency_hist::max_bucket 19 +system.ruby.LD.miss_latency_hist::samples 16160 +system.ruby.LD.miss_latency_hist::mean 1.005569 +system.ruby.LD.miss_latency_hist::gmean 1.000911 +system.ruby.LD.miss_latency_hist::stdev 0.316580 +system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00% +system.ruby.LD.miss_latency_hist::total 16160 
+system.ruby.ST.latency_hist::bucket_size 64 +system.ruby.ST.latency_hist::max_bucket 639 +system.ruby.ST.latency_hist::samples 10412 +system.ruby.ST.latency_hist::mean 7.384076 +system.ruby.ST.latency_hist::gmean 1.178989 +system.ruby.ST.latency_hist::stdev 36.341010 +system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 0 0.00% 96.91% | 0 0.00% 96.91% | 309 2.97% 99.88% | 4 0.04% 99.91% | 2 0.02% 99.93% | 3 0.03% 99.96% | 4 0.04% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.latency_hist::total 10412 +system.ruby.ST.hit_latency_hist::bucket_size 64 +system.ruby.ST.hit_latency_hist::max_bucket 639 +system.ruby.ST.hit_latency_hist::samples 322 +system.ruby.ST.hit_latency_hist::mean 207.431677 +system.ruby.ST.hit_latency_hist::gmean 205.258691 +system.ruby.ST.hit_latency_hist::stdev 37.529677 +system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 309 95.96% 95.96% | 4 1.24% 97.20% | 2 0.62% 97.83% | 3 0.93% 98.76% | 4 1.24% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.hit_latency_hist::total 322 +system.ruby.ST.miss_latency_hist::bucket_size 1 +system.ruby.ST.miss_latency_hist::max_bucket 9 +system.ruby.ST.miss_latency_hist::samples 10090 +system.ruby.ST.miss_latency_hist::mean 1 +system.ruby.ST.miss_latency_hist::gmean 1 +system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.miss_latency_hist::total 10090 +system.ruby.IFETCH.latency_hist::bucket_size 64 +system.ruby.IFETCH.latency_hist::max_bucket 639 +system.ruby.IFETCH.latency_hist::samples 87095 +system.ruby.IFETCH.latency_hist::mean 3.432677 +system.ruby.IFETCH.latency_hist::gmean 1.067087 +system.ruby.IFETCH.latency_hist::stdev 22.344689 +system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 0 0.00% 98.81% | 1006 1.16% 99.97% | 5 0.01% 99.97% | 10 0.01% 99.99% | 
11 0.01% 100.00% | 2 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.latency_hist::total 87095 +system.ruby.IFETCH.hit_latency_hist::bucket_size 64 +system.ruby.IFETCH.hit_latency_hist::max_bucket 639 +system.ruby.IFETCH.hit_latency_hist::samples 1034 +system.ruby.IFETCH.hit_latency_hist::mean 204.967118 +system.ruby.IFETCH.hit_latency_hist::gmean 203.475698 +system.ruby.IFETCH.hit_latency_hist::stdev 30.573589 +system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1006 97.29% 97.29% | 5 0.48% 97.78% | 10 0.97% 98.74% | 11 1.06% 99.81% | 2 0.19% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.hit_latency_hist::total 1034 +system.ruby.IFETCH.miss_latency_hist::bucket_size 2 +system.ruby.IFETCH.miss_latency_hist::max_bucket 19 +system.ruby.IFETCH.miss_latency_hist::samples 86061 +system.ruby.IFETCH.miss_latency_hist::mean 1.011294 +system.ruby.IFETCH.miss_latency_hist::gmean 1.001849 +system.ruby.IFETCH.miss_latency_hist::stdev 0.450747 +system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00% +system.ruby.IFETCH.miss_latency_hist::total 86061 +system.ruby.RMW_Read.latency_hist::bucket_size 32 +system.ruby.RMW_Read.latency_hist::max_bucket 319 +system.ruby.RMW_Read.latency_hist::samples 341 +system.ruby.RMW_Read.latency_hist::mean 3.451613 +system.ruby.RMW_Read.latency_hist::gmean 1.064718 +system.ruby.RMW_Read.latency_hist::stdev 22.561449 +system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 3 0.88% 99.71% | 1 0.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.latency_hist::total 341 +system.ruby.RMW_Read.hit_latency_hist::bucket_size 32 +system.ruby.RMW_Read.hit_latency_hist::max_bucket 319 
+system.ruby.RMW_Read.hit_latency_hist::samples 4 +system.ruby.RMW_Read.hit_latency_hist::mean 210 +system.ruby.RMW_Read.hit_latency_hist::gmean 209.766277 +system.ruby.RMW_Read.hit_latency_hist::stdev 11.430952 +system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.hit_latency_hist::total 4 +system.ruby.RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.RMW_Read.miss_latency_hist::samples 337 +system.ruby.RMW_Read.miss_latency_hist::mean 1 +system.ruby.RMW_Read.miss_latency_hist::gmean 1 +system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.miss_latency_hist::total 337 +system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.latency_hist::samples 10 +system.ruby.Locked_RMW_Read.latency_hist::mean 1 +system.ruby.Locked_RMW_Read.latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.latency_hist::total 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 
0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.miss_latency_hist::total 10 +system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.latency_hist::samples 10 +system.ruby.Locked_RMW_Write.latency_hist::mean 1 +system.ruby.Locked_RMW_Write.latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.latency_hist::total 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.miss_latency_hist::total 10 +system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1 +system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9 +system.ruby.L1Cache.miss_mach_latency_hist::samples 112609 +system.ruby.L1Cache.miss_mach_latency_hist::mean 1 +system.ruby.L1Cache.miss_mach_latency_hist::gmean 1 +system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L1Cache.miss_mach_latency_hist::total 112609 +system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2 +system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19 +system.ruby.L2Cache.miss_mach_latency_hist::samples 
59 +system.ruby.L2Cache.miss_mach_latency_hist::mean 19 +system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000 +system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00% +system.ruby.L2Cache.miss_mach_latency_hist::total 59 +system.ruby.Directory.hit_mach_latency_hist::bucket_size 64 +system.ruby.Directory.hit_mach_latency_hist::max_bucket 639 +system.ruby.Directory.hit_mach_latency_hist::samples 1535 +system.ruby.Directory.hit_mach_latency_hist::mean 206.165472 +system.ruby.Directory.hit_mach_latency_hist::gmean 204.491657 +system.ruby.Directory.hit_mach_latency_hist::stdev 32.551053 +system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1489 97.00% 97.00% | 10 0.65% 97.65% | 13 0.85% 98.50% | 16 1.04% 99.54% | 7 0.46% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Directory.hit_mach_latency_hist::total 1535 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00% +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5 +system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175 +system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 210.828571 +system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 209.031405 +system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 34.022715 +system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 170 97.14% 97.14% | 1 0.57% 97.71% | 1 0.57% 98.29% | 2 1.14% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090 +system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322 +system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 207.431677 +system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 205.258691 
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 37.529677 +system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 309 95.96% 95.96% | 4 1.24% 97.20% | 2 0.62% 97.83% | 3 0.93% 98.76% | 4 1.24% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00% +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 204.967118 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 
203.475698 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 30.573589 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1006 97.29% 97.29% | 5 0.48% 97.78% | 10 0.97% 98.74% | 11 1.06% 99.81% | 2 0.19% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 210 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 209.766277 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 11.430952 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.SQC_Controller.Fetch 86 0.00% 0.00% +system.ruby.SQC_Controller.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00% +system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00% +system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00% +system.ruby.TCC_Controller.Atomic 2 0.00% 0.00% +system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00% +system.ruby.TCC_Controller.Data 9 0.00% 0.00% +system.ruby.TCC_Controller.PrbInv 1 0.00% 0.00% +system.ruby.TCC_Controller.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00% +system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00% +system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 
0.00% +system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00% +system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00% +system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00% +system.ruby.TCC_Controller.A.Data 2 0.00% 0.00% +system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.Load::total 10 +system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.StoreThrough::total 16 +system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.Atomic::total 2 +system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00% +system.ruby.TCP_Controller.Flush::total 1536 +system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00% +system.ruby.TCP_Controller.Evict::total 1024 +system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.TCC_Ack::total 6 +system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckWB::total 16 +system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.Load::total 4 +system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.StoreThrough::total 16 +system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.I.Atomic::total 2 +system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 100.00% +system.ruby.TCP_Controller.I.Flush::total 1532 +system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00% +system.ruby.TCP_Controller.I.Evict::total 1020 +system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_Ack::total 4 +system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 
50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_AckWB::total 16 +system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.V.Load::total 6 +system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Flush::total 4 +system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Evict::total 4 +system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.A.TCC_Ack::total 2 + +---------- End Simulation Statistics ---------- diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini new file mode 100644 index 000000000..38646dce2 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini @@ -0,0 +1,5094 @@ +[root] +type=Root +children=system +eventq_index=0 +full_system=false +sim_quantum=0 +time_sync_enable=false +time_sync_period=100000000000 +time_sync_spin_threshold=100000000 + +[system] +type=System +children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus rb_cntrl0 reg_cntrl0 ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tcc_rb_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain +boot_osflags=a +cache_line_size=64 +clk_domain=system.clk_domain +eventq_index=0 +exit_on_work_items=false +init_param=0 +kernel= +kernel_addr_check=true +load_addr_mask=1099511627775 +load_offset=0 +mem_mode=timing +mem_ranges=0:536870911 +memories=system.mem_ctrls system.ruby.phys_mem +mmap_using_noreserve=false +multi_thread=false +num_work_ids=16 +readfile= +symbolfile= +work_begin_ckpt_count=0 +work_begin_cpu_id_exit=-1 +work_begin_exit_count=0 +work_cpus_ckpt_count=0 +work_end_ckpt_count=0 +work_end_exit_count=0 
+work_item_id=-1 +system_port=system.sys_port_proxy.slave[0] + +[system.clk_domain] +type=SrcClockDomain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cp_cntrl0] +type=CorePair_Controller +children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore +L1D0cache=system.cp_cntrl0.L1D0cache +L1D1cache=system.cp_cntrl0.L1D1cache +L1Icache=system.cp_cntrl0.L1Icache +L2cache=system.cp_cntrl0.L2cache +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.cp_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToCore=system.cp_cntrl0.probeToCore +recycle_latency=10 +regionBufferNum=0 +requestFromCore=system.cp_cntrl0.requestFromCore +responseFromCore=system.cp_cntrl0.responseFromCore +responseToCore=system.cp_cntrl0.responseToCore +ruby_system=system.ruby +send_evictions=true +sequencer=system.cp_cntrl0.sequencer +sequencer1=system.cp_cntrl0.sequencer1 +system=system +transitions_per_cycle=32 +triggerQueue=system.cp_cntrl0.triggerQueue +unblockFromCore=system.cp_cntrl0.unblockFromCore +version=0 + +[system.cp_cntrl0.L1D0cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1D0cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.cp_cntrl0.L1D1cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy +resourceStalls=false 
+ruby_system=system.ruby +size=65536 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1D1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=65536 + +[system.cp_cntrl0.L1Icache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=2 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=2 + +[system.cp_cntrl0.L1Icache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=32768 + +[system.cp_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.cp_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=2097152 + +[system.cp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.cp_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[0] + +[system.cp_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[0] + +[system.cp_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[1] + +[system.cp_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[1] + 
+[system.cp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=0 +dcache=system.cp_cntrl0.L1D0cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=0 +master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave +mem_master_port=system.piobus.slave[0] +slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master + +[system.cp_cntrl0.sequencer1] +type=RubySequencer +clk_domain=system.clk_domain +coreid=1 +dcache=system.cp_cntrl0.L1D1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=1 + +[system.cp_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.cp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[2] + +[system.cpu0] +type=TimingSimpleCPU +children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload +branchPred=Null +checker=Null +clk_domain=system.cpu0.clk_domain +cpu_id=0 +do_checkpoint_insts=true +do_quiesce=true +do_statistics_insts=true +dtb=system.cpu0.dtb +eventq_index=0 +function_trace=false +function_trace_start=0 +interrupts=system.cpu0.interrupts +isa=system.cpu0.isa +itb=system.cpu0.itb +max_insts_all_threads=0 +max_insts_any_thread=0 +max_loads_all_threads=0 
+max_loads_any_thread=0 +numThreads=1 +profile=0 +progress_interval=0 +simpoint_start_insts= +socket_id=0 +switched_out=false +system=system +tracer=system.cpu0.tracer +workload=system.cpu0.workload +dcache_port=system.cp_cntrl0.sequencer.slave[1] +icache_port=system.cp_cntrl0.sequencer.slave[0] + +[system.cpu0.apic_clk_domain] +type=DerivedClockDomain +clk_divider=16 +clk_domain=system.cpu0.clk_domain +eventq_index=0 + +[system.cpu0.clk_domain] +type=SrcClockDomain +clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cpu0.dtb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.dtb.walker + +[system.cpu0.dtb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[3] + +[system.cpu0.interrupts] +type=X86LocalApic +clk_domain=system.cpu0.apic_clk_domain +eventq_index=0 +int_latency=1000 +pio_addr=2305843009213693952 +pio_latency=100000 +system=system +int_master=system.cp_cntrl0.sequencer.slave[4] +int_slave=system.cp_cntrl0.sequencer.master[1] +pio=system.cp_cntrl0.sequencer.master[0] + +[system.cpu0.isa] +type=X86ISA +eventq_index=0 + +[system.cpu0.itb] +type=X86TLB +children=walker +eventq_index=0 +size=64 +walker=system.cpu0.itb.walker + +[system.cpu0.itb.walker] +type=X86PagetableWalker +clk_domain=system.cpu0.clk_domain +eventq_index=0 +num_squash_per_cycle=4 +system=system +port=system.cp_cntrl0.sequencer.slave[2] + +[system.cpu0.tracer] +type=ExeTracer +eventq_index=0 + +[system.cpu0.workload] +type=LiveProcess +cmd=gpu-hello +cwd= +drivers=system.cpu2.cl_driver +egid=100 +env= +errout=cerr +euid=100 +eventq_index=0 +executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello +gid=100 +input=cin +kvmInSE=false +max_stack_size=67108864 +output=cout +pid=100 +ppid=99 +simpoint=0 +system=system +uid=100 +useArchPT=false + +[system.cpu1] +type=Shader +children=CUs0 
CUs1 clk_domain +CUs=system.cpu1.CUs0 system.cpu1.CUs1 +clk_domain=system.cpu1.clk_domain +cpu_pointer=system.cpu0 +eventq_index=0 +globalmem=65536 +impl_kern_boundary_sync=true +n_wf=8 +separate_acquire_release=false +timing=true +translation=false + +[system.cpu1.CUs0] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=0 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 +localDataStore=system.cpu1.CUs0.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 
system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs0.ldsBus.slave +memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] 
system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[0] +sqc_tlb_port=system.sqc_coalescer.slave[0] +translation_port=system.l1_coalescer0.slave[0] + +[system.cpu1.CUs0.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs0.localDataStore.cuPort +slave=system.cpu1.CUs0.ldsPort + +[system.cpu1.CUs0.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs0.ldsBus.master + +[system.cpu1.CUs0.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs0.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + 
+[system.cpu1.CUs0.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs0.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs0.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + 
+[system.cpu1.CUs0.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs0.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs0.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs0.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs0.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs0.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs0.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs0.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 + +[system.cpu1.CUs0.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.CUs1] +type=ComputeUnit +children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31 +clk_domain=system.cpu1.clk_domain +coalescer_to_vrf_bus_width=32 +countPages=false +cu_id=1 +debugSegFault=false +dpbypass_pipe_length=4 +eventq_index=0 +execPolicy=OLDEST-FIRST +functionalTLB=true +global_mem_queue_size=256 +issue_period=4 
+localDataStore=system.cpu1.CUs1.localDataStore +localMemBarrier=false +local_mem_queue_size=256 +mem_req_latency=9 +mem_resp_latency=9 +n_wf=8 +num_SIMDs=4 +num_global_mem_pipes=1 +num_shared_mem_pipes=1 +perLaneTLB=false +prefetch_depth=0 +prefetch_prev_type=PF_PHASE +prefetch_stride=1 +spbypass_pipe_length=4 +system=system +vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3 +vrf_to_coalescer_bus_width=32 +wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31 +wfSize=64 +xactCasMode=false +ldsPort=system.cpu1.CUs1.ldsBus.slave +memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] 
system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] 
system.tcp_cntrl1.coalescer.slave[63] +sqc_port=system.sqc_cntrl0.sequencer.slave[1] +sqc_tlb_port=system.sqc_coalescer.slave[1] +translation_port=system.l1_coalescer1.slave[0] + +[system.cpu1.CUs1.ldsBus] +type=Bridge +clk_domain=system.cpu1.clk_domain +delay=0 +eventq_index=0 +ranges=0:18446744073709551615 +req_size=16 +resp_size=16 +master=system.cpu1.CUs1.localDataStore.cuPort +slave=system.cpu1.CUs1.ldsPort + +[system.cpu1.CUs1.localDataStore] +type=LdsState +bankConflictPenalty=1 +banks=32 +clk_domain=system.cpu1.clk_domain +eventq_index=0 +range=0:65535 +size=65536 +cuPort=system.cpu1.CUs1.ldsBus.master + +[system.cpu1.CUs1.vector_register_file0] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=0 + +[system.cpu1.CUs1.vector_register_file1] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=1 + +[system.cpu1.CUs1.vector_register_file2] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=2 + +[system.cpu1.CUs1.vector_register_file3] +type=VectorRegisterFile +eventq_index=0 +min_alloc=4 +num_regs_per_simd=2048 +simd_id=3 + +[system.cpu1.CUs1.wavefronts00] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts01] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts02] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts03] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts04] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts05] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts06] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts07] +type=Wavefront +eventq_index=0 +simdId=0 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts08] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=0 + 
+[system.cpu1.CUs1.wavefronts09] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts10] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts11] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts12] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts13] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts14] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts15] +type=Wavefront +eventq_index=0 +simdId=1 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts16] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts17] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts18] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts19] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts20] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts21] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts22] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=6 + +[system.cpu1.CUs1.wavefronts23] +type=Wavefront +eventq_index=0 +simdId=2 +wf_slot_id=7 + +[system.cpu1.CUs1.wavefronts24] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=0 + +[system.cpu1.CUs1.wavefronts25] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=1 + +[system.cpu1.CUs1.wavefronts26] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=2 + +[system.cpu1.CUs1.wavefronts27] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=3 + +[system.cpu1.CUs1.wavefronts28] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=4 + +[system.cpu1.CUs1.wavefronts29] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=5 + +[system.cpu1.CUs1.wavefronts30] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=6 
+ +[system.cpu1.CUs1.wavefronts31] +type=Wavefront +eventq_index=0 +simdId=3 +wf_slot_id=7 + +[system.cpu1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.cpu1.clk_domain.voltage_domain + +[system.cpu1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.cpu2] +type=GpuDispatcher +children=cl_driver +cl_driver=system.cpu2.cl_driver +clk_domain=system.clk_domain +cpu=system.cpu0 +eventq_index=0 +pio_addr=8589934592 +pio_latency=1000 +shader_pointer=system.cpu1 +system=system +dma=system.piobus.slave[1] +pio=system.piobus.master[0] +translation_port=system.dispatcher_coalescer.slave[0] + +[system.cpu2.cl_driver] +type=ClDriver +codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +eventq_index=0 +filename=hsa + +[system.dir_cntrl0] +type=Directory_Controller +children=L3CacheMemory L3triggerQueue directory probeToCore reqFromRegBuf reqFromRegDir reqToRegDir requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores unblockToRegDir +L3CacheMemory=system.dir_cntrl0.L3CacheMemory +L3triggerQueue=system.dir_cntrl0.L3triggerQueue +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +directory=system.dir_cntrl0.directory +eventq_index=0 +l3_hit_latency=15 +number_of_TBEs=5120 +probeToCore=system.dir_cntrl0.probeToCore +recycle_latency=10 +reqFromRegBuf=system.dir_cntrl0.reqFromRegBuf +reqFromRegDir=system.dir_cntrl0.reqFromRegDir +reqToRegDir=system.dir_cntrl0.reqToRegDir +requestFromCores=system.dir_cntrl0.requestFromCores +responseFromCores=system.dir_cntrl0.responseFromCores +responseFromMemory=system.dir_cntrl0.responseFromMemory +responseToCore=system.dir_cntrl0.responseToCore +response_latency=25 +response_latency_regionDir=1 +ruby_system=system.ruby +system=system +to_memory_controller_latency=1 +transitions_per_cycle=32 
+triggerQueue=system.dir_cntrl0.triggerQueue +unblockFromCores=system.dir_cntrl0.unblockFromCores +unblockToRegDir=system.dir_cntrl0.unblockToRegDir +useL3OnWT=false +version=0 +memory=system.mem_ctrls.port + +[system.dir_cntrl0.L3CacheMemory] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=20 +dataArrayBanks=16.0 +eventq_index=0 +is_icache=false +replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=16777216 +start_index_bit=6 +tagAccessLatency=15 +tagArrayBanks=16.0 + +[system.dir_cntrl0.L3CacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16777216 + +[system.dir_cntrl0.L3triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.directory] +type=RubyDirectoryMemory +eventq_index=0 +numa_high_bit=5 +size=536870912 +version=0 + +[system.dir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[18] + +[system.dir_cntrl0.reqFromRegBuf] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[24] + +[system.dir_cntrl0.reqFromRegDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[25] + +[system.dir_cntrl0.reqToRegDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[20] + +[system.dir_cntrl0.requestFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[21] + +[system.dir_cntrl0.responseFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[22] + +[system.dir_cntrl0.responseFromMemory] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.dir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[19] + +[system.dir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.unblockFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[23] + +[system.dir_cntrl0.unblockToRegDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[21] + +[system.dispatcher_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.dispatcher_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.dispatcher_tlb.slave[0] +slave=system.cpu2.translation_port + +[system.dispatcher_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain + +[system.dispatcher_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dispatcher_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.dispatcher_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[1] +slave=system.dispatcher_coalescer.master[0] + +[system.dispatcher_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain + +[system.dispatcher_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.dvfs_handler] 
+type=DVFSHandler +domains= +enable=false +eventq_index=0 +sys_clk_domain=system.clk_domain +transition_latency=100000000 + +[system.l1_coalescer0] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer0.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb0.slave[0] +slave=system.cpu1.CUs0.translation_port[0] + +[system.l1_coalescer0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain + +[system.l1_coalescer0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_coalescer1] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l1_coalescer1.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l1_tlb1.slave[0] +slave=system.cpu1.CUs1.translation_port[0] + +[system.l1_coalescer1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain + +[system.l1_coalescer1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb0] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l1_tlb0.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[2] +slave=system.l1_coalescer0.master[0] + +[system.l1_tlb0.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb0.clk_domain.voltage_domain + +[system.l1_tlb0.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l1_tlb1] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true 
+assoc=32 +clk_domain=system.l1_tlb1.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[3] +slave=system.l1_coalescer1.master[0] + +[system.l1_tlb1.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l1_tlb1.clk_domain.voltage_domain + +[system.l1_tlb1.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l2_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l2_tlb.slave[0] +slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0] + +[system.l2_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_coalescer.clk_domain.voltage_domain + +[system.l2_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l2_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l2_tlb.clk_domain +eventq_index=0 +hitLatency=69 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=4096 +master=system.l3_coalescer.slave[0] +slave=system.l2_coalescer.master[0] + +[system.l2_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l2_tlb.clk_domain.voltage_domain + +[system.l2_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.l3_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.l3_tlb.slave[0] 
+slave=system.l2_tlb.master[0] + +[system.l3_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_coalescer.clk_domain.voltage_domain + +[system.l3_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.l3_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.l3_tlb.clk_domain +eventq_index=0 +hitLatency=150 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=8192 +slave=system.l3_coalescer.master[0] + +[system.l3_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.l3_tlb.clk_domain.voltage_domain + +[system.l3_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.mem_ctrls] +type=DRAMCtrl +IDD0=0.075000 +IDD02=0.000000 +IDD2N=0.050000 +IDD2N2=0.000000 +IDD2P0=0.000000 +IDD2P02=0.000000 +IDD2P1=0.000000 +IDD2P12=0.000000 +IDD3N=0.057000 +IDD3N2=0.000000 +IDD3P0=0.000000 +IDD3P02=0.000000 +IDD3P1=0.000000 +IDD3P12=0.000000 +IDD4R=0.187000 +IDD4R2=0.000000 +IDD4W=0.165000 +IDD4W2=0.000000 +IDD5=0.220000 +IDD52=0.000000 +IDD6=0.000000 +IDD62=0.000000 +VDD=1.500000 +VDD2=0.000000 +activation_limit=4 +addr_mapping=RoRaBaCoCh +bank_groups_per_rank=0 +banks_per_rank=8 +burst_length=8 +channels=1 +clk_domain=system.clk_domain +conf_table_reported=true +device_bus_width=8 +device_rowbuffer_size=1024 +device_size=536870912 +devices_per_rank=8 +dll=true +eventq_index=0 +in_addr_map=true +max_accesses_per_row=16 +mem_sched_policy=frfcfs +min_writes_per_switch=16 +null=false +page_policy=open_adaptive +range=0:536870911 +ranks_per_channel=2 +read_buffer_size=32 +static_backend_latency=10000 +static_frontend_latency=10000 +tBURST=5000 +tCCD_L=0 +tCK=1250 +tCL=13750 +tCS=2500 +tRAS=35000 +tRCD=13750 +tREFI=7800000 +tRFC=260000 
+tRP=13750 +tRRD=6000 +tRRD_L=0 +tRTP=7500 +tRTW=2500 +tWR=15000 +tWTR=7500 +tXAW=30000 +tXP=0 +tXPDLL=0 +tXS=0 +tXSDLL=0 +write_buffer_size=64 +write_high_thresh_perc=85 +write_low_thresh_perc=50 +port=system.dir_cntrl0.memory + +[system.piobus] +type=NoncoherentXBar +clk_domain=system.clk_domain +eventq_index=0 +forward_latency=0 +frontend_latency=0 +response_latency=0 +use_default_range=false +width=32 +master=system.cpu2.pio +slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma + +[system.rb_cntrl0] +type=RegionBuffer_Controller +children=cacheMemory notifyFromRegionDir probeFromRegionDir requestFromCore requestToNetwork responseFromCore responseToRegDir triggerQueue unblockFromDir +TCC_select_num_bits=0 +blocksPerRegion=16 +buffer_size=0 +cacheMemory=system.rb_cntrl0.cacheMemory +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +isOnCPU=true +nextEvictLatency=1 +noTCCdir=true +notifyFromRegionDir=system.rb_cntrl0.notifyFromRegionDir +number_of_TBEs=256 +probeFromRegionDir=system.rb_cntrl0.probeFromRegionDir +recycle_latency=10 +requestFromCore=system.rb_cntrl0.requestFromCore +requestToNetwork=system.rb_cntrl0.requestToNetwork +responseFromCore=system.rb_cntrl0.responseFromCore +responseToRegDir=system.rb_cntrl0.responseToRegDir +ruby_system=system.ruby +system=system +toDirLatency=60 +toRegionDirLatency=120 +transitions_per_cycle=32 +triggerQueue=system.rb_cntrl0.triggerQueue +unblockFromDir=system.rb_cntrl0.unblockFromDir +version=0 + +[system.rb_cntrl0.cacheMemory] +type=RubyCache +children=replacement_policy +assoc=4 +block_size=1024 +dataAccessLatency=1 +dataArrayBanks=64 +eventq_index=0 +is_icache=false +replacement_policy=system.rb_cntrl0.cacheMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=1048576 +start_index_bit=10 +tagAccessLatency=1 +tagArrayBanks=64 + +[system.rb_cntrl0.cacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=4 +block_size=64 +eventq_index=0 +size=1048576 + 
+[system.rb_cntrl0.notifyFromRegionDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[4] + +[system.rb_cntrl0.probeFromRegionDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[5] + +[system.rb_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[2] + +[system.rb_cntrl0.requestToNetwork] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[3] + +[system.rb_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[3] + +[system.rb_cntrl0.responseToRegDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[4] + +[system.rb_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.rb_cntrl0.unblockFromDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[6] + +[system.reg_cntrl0] +type=RegionDir_Controller +children=cacheMemory notifyToRBuffer probeToRBuffer requestFromRegBuf requestToDir responseFromRBuffer triggerQueue +TCC_select_num_bits=0 +always_migrate=false +asym_migrate=false +blocksPerRegion=16 +buffer_size=0 +cacheMemory=system.reg_cntrl0.cacheMemory +clk_domain=system.clk_domain +cluster_id=0 +cpuRegionBufferNum=0 +eventq_index=0 +gpuRegionBufferNum=1 +noTCCdir=true +notifyToRBuffer=system.reg_cntrl0.notifyToRBuffer +number_of_TBEs=32 +probeToRBuffer=system.reg_cntrl0.probeToRBuffer +recycle_latency=10 +requestFromRegBuf=system.reg_cntrl0.requestFromRegBuf +requestToDir=system.reg_cntrl0.requestToDir +responseFromRBuffer=system.reg_cntrl0.responseFromRBuffer 
+ruby_system=system.ruby +sym_migrate=false +system=system +toDirLatency=1 +transitions_per_cycle=32 +triggerQueue=system.reg_cntrl0.triggerQueue +version=0 + +[system.reg_cntrl0.cacheMemory] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=1024 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.reg_cntrl0.cacheMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=2097152 +start_index_bit=10 +tagAccessLatency=4 +tagArrayBanks=8 + +[system.reg_cntrl0.cacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=2097152 + +[system.reg_cntrl0.notifyToRBuffer] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[23] + +[system.reg_cntrl0.probeToRBuffer] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[24] + +[system.reg_cntrl0.requestFromRegBuf] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[27] + +[system.reg_cntrl0.requestToDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[22] + +[system.reg_cntrl0.responseFromRBuffer] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[26] + +[system.reg_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby] +type=RubySystem +children=clk_domain network phys_mem +access_backing_store=true +all_instructions=false +block_size_bytes=64 +clk_domain=system.ruby.clk_domain +eventq_index=0 +hot_lines=false +memory_size_bits=48 +num_of_sequencers=5 +number_of_virtual_networks=10 +phys_mem=system.ruby.phys_mem +randomization=false + +[system.ruby.clk_domain] +type=SrcClockDomain 
+clock=500 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.ruby.network] +type=SimpleNetwork +children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 ext_links6 ext_links7 ext_links8 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1 +adaptive_routing=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +control_msg_size=8 +endpoint_bandwidth=1000 +eventq_index=0 +ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 system.ruby.network.ext_links6 system.ruby.network.ext_links7 system.ruby.network.ext_links8 +int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 
system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39 +int_links=system.ruby.network.int_links0 system.ruby.network.int_links1 +netifs= +number_of_virtual_networks=10 +routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links2.int_node system.ruby.network.ext_links4.int_node +ruby_system=system.ruby +topology=Crossbar +master=system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.rb_cntrl0.requestFromCore.slave system.rb_cntrl0.responseFromCore.slave system.rb_cntrl0.notifyFromRegionDir.slave system.rb_cntrl0.probeFromRegionDir.slave system.rb_cntrl0.unblockFromDir.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.requestFromTCP.slave system.tcc_cntrl0.probeFromNB.slave system.tcc_cntrl0.responseFromNB.slave system.tcc_rb_cntrl0.requestFromCore.slave 
system.tcc_rb_cntrl0.responseFromCore.slave system.tcc_rb_cntrl0.notifyFromRegionDir.slave system.tcc_rb_cntrl0.probeFromRegionDir.slave system.tcc_rb_cntrl0.unblockFromDir.slave system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.dir_cntrl0.reqFromRegBuf.slave system.dir_cntrl0.reqFromRegDir.slave system.reg_cntrl0.responseFromRBuffer.slave system.reg_cntrl0.requestFromRegBuf.slave +slave=system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.rb_cntrl0.requestToNetwork.master system.rb_cntrl0.responseToRegDir.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.tcc_cntrl0.responseToCore.master system.tcc_cntrl0.requestToNB.master system.tcc_cntrl0.responseToNB.master system.tcc_cntrl0.unblockToNB.master system.tcc_rb_cntrl0.requestToNetwork.master system.tcc_rb_cntrl0.responseToRegDir.master system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.dir_cntrl0.reqToRegDir.master system.dir_cntrl0.unblockToRegDir.master system.reg_cntrl0.requestToDir.master system.reg_cntrl0.notifyToRBuffer.master system.reg_cntrl0.probeToRBuffer.master + +[system.ruby.network.ext_links0] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.dir_cntrl0 +int_node=system.ruby.network.ext_links0.int_node +latency=1 +link_id=0 +weight=1 + +[system.ruby.network.ext_links0.int_node] +type=Switch +children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 
port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links0.int_node.port_buffers000 system.ruby.network.ext_links0.int_node.port_buffers001 system.ruby.network.ext_links0.int_node.port_buffers002 system.ruby.network.ext_links0.int_node.port_buffers003 system.ruby.network.ext_links0.int_node.port_buffers004 system.ruby.network.ext_links0.int_node.port_buffers005 system.ruby.network.ext_links0.int_node.port_buffers006 
system.ruby.network.ext_links0.int_node.port_buffers007 system.ruby.network.ext_links0.int_node.port_buffers008 system.ruby.network.ext_links0.int_node.port_buffers009 system.ruby.network.ext_links0.int_node.port_buffers010 system.ruby.network.ext_links0.int_node.port_buffers011 system.ruby.network.ext_links0.int_node.port_buffers012 system.ruby.network.ext_links0.int_node.port_buffers013 system.ruby.network.ext_links0.int_node.port_buffers014 system.ruby.network.ext_links0.int_node.port_buffers015 system.ruby.network.ext_links0.int_node.port_buffers016 system.ruby.network.ext_links0.int_node.port_buffers017 system.ruby.network.ext_links0.int_node.port_buffers018 system.ruby.network.ext_links0.int_node.port_buffers019 system.ruby.network.ext_links0.int_node.port_buffers020 system.ruby.network.ext_links0.int_node.port_buffers021 system.ruby.network.ext_links0.int_node.port_buffers022 system.ruby.network.ext_links0.int_node.port_buffers023 system.ruby.network.ext_links0.int_node.port_buffers024 system.ruby.network.ext_links0.int_node.port_buffers025 system.ruby.network.ext_links0.int_node.port_buffers026 system.ruby.network.ext_links0.int_node.port_buffers027 system.ruby.network.ext_links0.int_node.port_buffers028 system.ruby.network.ext_links0.int_node.port_buffers029 system.ruby.network.ext_links0.int_node.port_buffers030 system.ruby.network.ext_links0.int_node.port_buffers031 system.ruby.network.ext_links0.int_node.port_buffers032 system.ruby.network.ext_links0.int_node.port_buffers033 system.ruby.network.ext_links0.int_node.port_buffers034 system.ruby.network.ext_links0.int_node.port_buffers035 system.ruby.network.ext_links0.int_node.port_buffers036 system.ruby.network.ext_links0.int_node.port_buffers037 system.ruby.network.ext_links0.int_node.port_buffers038 system.ruby.network.ext_links0.int_node.port_buffers039 system.ruby.network.ext_links0.int_node.port_buffers040 system.ruby.network.ext_links0.int_node.port_buffers041 
system.ruby.network.ext_links0.int_node.port_buffers042 system.ruby.network.ext_links0.int_node.port_buffers043 system.ruby.network.ext_links0.int_node.port_buffers044 system.ruby.network.ext_links0.int_node.port_buffers045 system.ruby.network.ext_links0.int_node.port_buffers046 system.ruby.network.ext_links0.int_node.port_buffers047 system.ruby.network.ext_links0.int_node.port_buffers048 system.ruby.network.ext_links0.int_node.port_buffers049 system.ruby.network.ext_links0.int_node.port_buffers050 system.ruby.network.ext_links0.int_node.port_buffers051 system.ruby.network.ext_links0.int_node.port_buffers052 system.ruby.network.ext_links0.int_node.port_buffers053 system.ruby.network.ext_links0.int_node.port_buffers054 system.ruby.network.ext_links0.int_node.port_buffers055 system.ruby.network.ext_links0.int_node.port_buffers056 system.ruby.network.ext_links0.int_node.port_buffers057 system.ruby.network.ext_links0.int_node.port_buffers058 system.ruby.network.ext_links0.int_node.port_buffers059 system.ruby.network.ext_links0.int_node.port_buffers060 system.ruby.network.ext_links0.int_node.port_buffers061 system.ruby.network.ext_links0.int_node.port_buffers062 system.ruby.network.ext_links0.int_node.port_buffers063 system.ruby.network.ext_links0.int_node.port_buffers064 system.ruby.network.ext_links0.int_node.port_buffers065 system.ruby.network.ext_links0.int_node.port_buffers066 system.ruby.network.ext_links0.int_node.port_buffers067 system.ruby.network.ext_links0.int_node.port_buffers068 system.ruby.network.ext_links0.int_node.port_buffers069 system.ruby.network.ext_links0.int_node.port_buffers070 system.ruby.network.ext_links0.int_node.port_buffers071 system.ruby.network.ext_links0.int_node.port_buffers072 system.ruby.network.ext_links0.int_node.port_buffers073 system.ruby.network.ext_links0.int_node.port_buffers074 system.ruby.network.ext_links0.int_node.port_buffers075 system.ruby.network.ext_links0.int_node.port_buffers076 
system.ruby.network.ext_links0.int_node.port_buffers077 system.ruby.network.ext_links0.int_node.port_buffers078 system.ruby.network.ext_links0.int_node.port_buffers079 system.ruby.network.ext_links0.int_node.port_buffers080 system.ruby.network.ext_links0.int_node.port_buffers081 system.ruby.network.ext_links0.int_node.port_buffers082 system.ruby.network.ext_links0.int_node.port_buffers083 system.ruby.network.ext_links0.int_node.port_buffers084 system.ruby.network.ext_links0.int_node.port_buffers085 system.ruby.network.ext_links0.int_node.port_buffers086 system.ruby.network.ext_links0.int_node.port_buffers087 system.ruby.network.ext_links0.int_node.port_buffers088 system.ruby.network.ext_links0.int_node.port_buffers089 system.ruby.network.ext_links0.int_node.port_buffers090 system.ruby.network.ext_links0.int_node.port_buffers091 system.ruby.network.ext_links0.int_node.port_buffers092 system.ruby.network.ext_links0.int_node.port_buffers093 system.ruby.network.ext_links0.int_node.port_buffers094 system.ruby.network.ext_links0.int_node.port_buffers095 system.ruby.network.ext_links0.int_node.port_buffers096 system.ruby.network.ext_links0.int_node.port_buffers097 system.ruby.network.ext_links0.int_node.port_buffers098 system.ruby.network.ext_links0.int_node.port_buffers099 system.ruby.network.ext_links0.int_node.port_buffers100 system.ruby.network.ext_links0.int_node.port_buffers101 system.ruby.network.ext_links0.int_node.port_buffers102 system.ruby.network.ext_links0.int_node.port_buffers103 system.ruby.network.ext_links0.int_node.port_buffers104 system.ruby.network.ext_links0.int_node.port_buffers105 system.ruby.network.ext_links0.int_node.port_buffers106 system.ruby.network.ext_links0.int_node.port_buffers107 system.ruby.network.ext_links0.int_node.port_buffers108 system.ruby.network.ext_links0.int_node.port_buffers109 +router_id=0 +virt_nets=10 + +[system.ruby.network.ext_links0.int_node.port_buffers000] +type=MessageBuffer +buffer_size=0 +eventq_index=0 
+ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers001] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers002] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers003] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers004] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers005] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers006] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers007] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers008] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers009] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers010] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers011] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers012] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers013] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers014] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers015] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers016] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers017] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers018] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers019] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers020] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers021] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers022] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers023] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers024] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers025] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers026] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers027] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers028] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers029] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers030] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers031] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers032] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers033] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers034] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers035] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers036] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers037] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers038] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers039] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers040] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers041] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers042] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers043] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers044] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers045] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers046] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers047] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers048] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers049] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers050] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers051] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers052] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers053] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers054] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers055] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers056] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers057] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers058] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers059] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers060] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers061] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers062] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers063] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers064] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers065] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers066] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers067] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers068] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers069] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers070] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers071] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers072] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers073] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers074] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers075] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers076] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers077] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers078] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers079] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers080] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers081] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers082] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers083] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers084] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers085] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers086] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers087] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers088] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers089] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers090] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers091] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers092] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers093] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers094] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links0.int_node.port_buffers095] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers096] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers097] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers098] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers099] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers100] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers101] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers102] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers103] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers104] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers105] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers106] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers107] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers108] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links0.int_node.port_buffers109] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links1] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.reg_cntrl0 +int_node=system.ruby.network.ext_links0.int_node +latency=1 +link_id=1 +weight=1 + +[system.ruby.network.ext_links2] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.cp_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=2 +weight=1 + +[system.ruby.network.ext_links2.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 
port_buffers86 port_buffers87 port_buffers88 port_buffers89 port_buffers90 port_buffers91 port_buffers92 port_buffers93 port_buffers94 port_buffers95 port_buffers96 port_buffers97 port_buffers98 port_buffers99 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 
system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 
system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 system.ruby.network.ext_links2.int_node.port_buffers70 system.ruby.network.ext_links2.int_node.port_buffers71 system.ruby.network.ext_links2.int_node.port_buffers72 system.ruby.network.ext_links2.int_node.port_buffers73 system.ruby.network.ext_links2.int_node.port_buffers74 system.ruby.network.ext_links2.int_node.port_buffers75 system.ruby.network.ext_links2.int_node.port_buffers76 system.ruby.network.ext_links2.int_node.port_buffers77 system.ruby.network.ext_links2.int_node.port_buffers78 system.ruby.network.ext_links2.int_node.port_buffers79 system.ruby.network.ext_links2.int_node.port_buffers80 system.ruby.network.ext_links2.int_node.port_buffers81 system.ruby.network.ext_links2.int_node.port_buffers82 system.ruby.network.ext_links2.int_node.port_buffers83 system.ruby.network.ext_links2.int_node.port_buffers84 system.ruby.network.ext_links2.int_node.port_buffers85 system.ruby.network.ext_links2.int_node.port_buffers86 system.ruby.network.ext_links2.int_node.port_buffers87 system.ruby.network.ext_links2.int_node.port_buffers88 system.ruby.network.ext_links2.int_node.port_buffers89 system.ruby.network.ext_links2.int_node.port_buffers90 system.ruby.network.ext_links2.int_node.port_buffers91 system.ruby.network.ext_links2.int_node.port_buffers92 system.ruby.network.ext_links2.int_node.port_buffers93 system.ruby.network.ext_links2.int_node.port_buffers94 system.ruby.network.ext_links2.int_node.port_buffers95 system.ruby.network.ext_links2.int_node.port_buffers96 system.ruby.network.ext_links2.int_node.port_buffers97 system.ruby.network.ext_links2.int_node.port_buffers98 system.ruby.network.ext_links2.int_node.port_buffers99 +router_id=1 +virt_nets=10 + +[system.ruby.network.ext_links2.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers80] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers81] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links2.int_node.port_buffers82] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers83] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers84] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers85] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers86] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers87] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers88] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers89] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers90] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers91] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers92] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers93] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers94] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers95] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers96] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers97] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers98] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links2.int_node.port_buffers99] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links3] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.rb_cntrl0 +int_node=system.ruby.network.ext_links2.int_node +latency=1 +link_id=3 +weight=1 + +[system.ruby.network.ext_links4] +type=SimpleExtLink +children=int_node +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcp_cntrl0 +int_node=system.ruby.network.ext_links4.int_node +latency=1 +link_id=4 +weight=1 + +[system.ruby.network.ext_links4.int_node] +type=Switch +children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 
port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 port_buffers86 port_buffers87 port_buffers88 port_buffers89 port_buffers90 port_buffers91 port_buffers92 port_buffers93 port_buffers94 port_buffers95 port_buffers96 port_buffers97 port_buffers98 port_buffers99 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links4.int_node.port_buffers00 system.ruby.network.ext_links4.int_node.port_buffers01 system.ruby.network.ext_links4.int_node.port_buffers02 system.ruby.network.ext_links4.int_node.port_buffers03 system.ruby.network.ext_links4.int_node.port_buffers04 system.ruby.network.ext_links4.int_node.port_buffers05 system.ruby.network.ext_links4.int_node.port_buffers06 system.ruby.network.ext_links4.int_node.port_buffers07 system.ruby.network.ext_links4.int_node.port_buffers08 system.ruby.network.ext_links4.int_node.port_buffers09 system.ruby.network.ext_links4.int_node.port_buffers10 system.ruby.network.ext_links4.int_node.port_buffers11 system.ruby.network.ext_links4.int_node.port_buffers12 system.ruby.network.ext_links4.int_node.port_buffers13 system.ruby.network.ext_links4.int_node.port_buffers14 system.ruby.network.ext_links4.int_node.port_buffers15 system.ruby.network.ext_links4.int_node.port_buffers16 system.ruby.network.ext_links4.int_node.port_buffers17 system.ruby.network.ext_links4.int_node.port_buffers18 system.ruby.network.ext_links4.int_node.port_buffers19 system.ruby.network.ext_links4.int_node.port_buffers20 system.ruby.network.ext_links4.int_node.port_buffers21 system.ruby.network.ext_links4.int_node.port_buffers22 
system.ruby.network.ext_links4.int_node.port_buffers23 system.ruby.network.ext_links4.int_node.port_buffers24 system.ruby.network.ext_links4.int_node.port_buffers25 system.ruby.network.ext_links4.int_node.port_buffers26 system.ruby.network.ext_links4.int_node.port_buffers27 system.ruby.network.ext_links4.int_node.port_buffers28 system.ruby.network.ext_links4.int_node.port_buffers29 system.ruby.network.ext_links4.int_node.port_buffers30 system.ruby.network.ext_links4.int_node.port_buffers31 system.ruby.network.ext_links4.int_node.port_buffers32 system.ruby.network.ext_links4.int_node.port_buffers33 system.ruby.network.ext_links4.int_node.port_buffers34 system.ruby.network.ext_links4.int_node.port_buffers35 system.ruby.network.ext_links4.int_node.port_buffers36 system.ruby.network.ext_links4.int_node.port_buffers37 system.ruby.network.ext_links4.int_node.port_buffers38 system.ruby.network.ext_links4.int_node.port_buffers39 system.ruby.network.ext_links4.int_node.port_buffers40 system.ruby.network.ext_links4.int_node.port_buffers41 system.ruby.network.ext_links4.int_node.port_buffers42 system.ruby.network.ext_links4.int_node.port_buffers43 system.ruby.network.ext_links4.int_node.port_buffers44 system.ruby.network.ext_links4.int_node.port_buffers45 system.ruby.network.ext_links4.int_node.port_buffers46 system.ruby.network.ext_links4.int_node.port_buffers47 system.ruby.network.ext_links4.int_node.port_buffers48 system.ruby.network.ext_links4.int_node.port_buffers49 system.ruby.network.ext_links4.int_node.port_buffers50 system.ruby.network.ext_links4.int_node.port_buffers51 system.ruby.network.ext_links4.int_node.port_buffers52 system.ruby.network.ext_links4.int_node.port_buffers53 system.ruby.network.ext_links4.int_node.port_buffers54 system.ruby.network.ext_links4.int_node.port_buffers55 system.ruby.network.ext_links4.int_node.port_buffers56 system.ruby.network.ext_links4.int_node.port_buffers57 system.ruby.network.ext_links4.int_node.port_buffers58 
system.ruby.network.ext_links4.int_node.port_buffers59 system.ruby.network.ext_links4.int_node.port_buffers60 system.ruby.network.ext_links4.int_node.port_buffers61 system.ruby.network.ext_links4.int_node.port_buffers62 system.ruby.network.ext_links4.int_node.port_buffers63 system.ruby.network.ext_links4.int_node.port_buffers64 system.ruby.network.ext_links4.int_node.port_buffers65 system.ruby.network.ext_links4.int_node.port_buffers66 system.ruby.network.ext_links4.int_node.port_buffers67 system.ruby.network.ext_links4.int_node.port_buffers68 system.ruby.network.ext_links4.int_node.port_buffers69 system.ruby.network.ext_links4.int_node.port_buffers70 system.ruby.network.ext_links4.int_node.port_buffers71 system.ruby.network.ext_links4.int_node.port_buffers72 system.ruby.network.ext_links4.int_node.port_buffers73 system.ruby.network.ext_links4.int_node.port_buffers74 system.ruby.network.ext_links4.int_node.port_buffers75 system.ruby.network.ext_links4.int_node.port_buffers76 system.ruby.network.ext_links4.int_node.port_buffers77 system.ruby.network.ext_links4.int_node.port_buffers78 system.ruby.network.ext_links4.int_node.port_buffers79 system.ruby.network.ext_links4.int_node.port_buffers80 system.ruby.network.ext_links4.int_node.port_buffers81 system.ruby.network.ext_links4.int_node.port_buffers82 system.ruby.network.ext_links4.int_node.port_buffers83 system.ruby.network.ext_links4.int_node.port_buffers84 system.ruby.network.ext_links4.int_node.port_buffers85 system.ruby.network.ext_links4.int_node.port_buffers86 system.ruby.network.ext_links4.int_node.port_buffers87 system.ruby.network.ext_links4.int_node.port_buffers88 system.ruby.network.ext_links4.int_node.port_buffers89 system.ruby.network.ext_links4.int_node.port_buffers90 system.ruby.network.ext_links4.int_node.port_buffers91 system.ruby.network.ext_links4.int_node.port_buffers92 system.ruby.network.ext_links4.int_node.port_buffers93 system.ruby.network.ext_links4.int_node.port_buffers94 
system.ruby.network.ext_links4.int_node.port_buffers95 system.ruby.network.ext_links4.int_node.port_buffers96 system.ruby.network.ext_links4.int_node.port_buffers97 system.ruby.network.ext_links4.int_node.port_buffers98 system.ruby.network.ext_links4.int_node.port_buffers99 +router_id=2 +virt_nets=10 + +[system.ruby.network.ext_links4.int_node.port_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers11] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links4.int_node.port_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers38] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers40] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers41] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers42] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers43] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers44] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers45] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers46] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers47] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers48] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers49] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers50] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers51] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links4.int_node.port_buffers52] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers53] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers54] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers55] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers56] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers57] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers58] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers59] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers60] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers61] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers62] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers63] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers64] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers65] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers66] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers67] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers68] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers69] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers70] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers71] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers72] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers73] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers74] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers75] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers76] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers77] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers78] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links4.int_node.port_buffers79] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers80] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers81] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers82] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers83] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers84] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers85] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers86] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers87] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers88] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers89] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers90] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers91] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers92] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers93] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers94] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers95] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers96] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers97] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers98] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links4.int_node.port_buffers99] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links5] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcp_cntrl1 +int_node=system.ruby.network.ext_links4.int_node +latency=1 +link_id=5 +weight=1 + +[system.ruby.network.ext_links6] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.sqc_cntrl0 +int_node=system.ruby.network.ext_links4.int_node +latency=1 +link_id=6 +weight=1 + +[system.ruby.network.ext_links7] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcc_cntrl0 +int_node=system.ruby.network.ext_links4.int_node +latency=1 +link_id=7 +weight=1 + +[system.ruby.network.ext_links8] +type=SimpleExtLink +bandwidth_factor=32 +eventq_index=0 +ext_node=system.tcc_rb_cntrl0 +int_node=system.ruby.network.ext_links4.int_node +latency=1 +link_id=8 +weight=1 + +[system.ruby.network.int_link_buffers00] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.int_link_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers15] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.int_link_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers31] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_links0] +type=SimpleIntLink +bandwidth_factor=32 +eventq_index=0 +latency=1 +link_id=0 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links2.int_node +weight=1 + +[system.ruby.network.int_links1] +type=SimpleIntLink +bandwidth_factor=32 +eventq_index=0 +latency=1 +link_id=1 +node_a=system.ruby.network.ext_links0.int_node +node_b=system.ruby.network.ext_links4.int_node +weight=1 + +[system.ruby.phys_mem] +type=SimpleMemory +bandwidth=73.000000 +clk_domain=system.ruby.clk_domain +conf_table_reported=true +eventq_index=0 +in_addr_map=false +latency=30000 +latency_var=0 +null=false +range=0:536870911 + +[system.sqc_cntrl0] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer +L1cache=system.sqc_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 
+clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 +mandatoryQueue=system.sqc_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.sqc_cntrl0.probeToSQC +recycle_latency=10 +requestFromSQC=system.sqc_cntrl0.requestFromSQC +responseToSQC=system.sqc_cntrl0.responseToSQC +ruby_system=system.ruby +sequencer=system.sqc_cntrl0.sequencer +system=system +transitions_per_cycle=32 +version=0 + +[system.sqc_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=8 +eventq_index=0 +is_icache=false +replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=8 + +[system.sqc_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.sqc_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.sqc_cntrl0.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[11] + +[system.sqc_cntrl0.requestFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[11] + +[system.sqc_cntrl0.responseToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[12] + +[system.sqc_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.sqc_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.sqc_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false 
+version=6 +slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port + +[system.sqc_coalescer] +type=TLBCoalescer +children=clk_domain +clk_domain=system.sqc_coalescer.clk_domain +coalescingWindow=1 +disableCoalescing=false +eventq_index=0 +probesPerCycle=2 +master=system.sqc_tlb.slave[0] +slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port + +[system.sqc_coalescer.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain + +[system.sqc_coalescer.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sqc_tlb] +type=X86GPUTLB +children=clk_domain +accessDistance=false +allocationPolicy=true +assoc=32 +clk_domain=system.sqc_tlb.clk_domain +eventq_index=0 +hitLatency=1 +maxOutstandingReqs=64 +missLatency1=5 +missLatency2=750 +size=32 +master=system.l2_coalescer.slave[0] +slave=system.sqc_coalescer.master[0] + +[system.sqc_tlb.clk_domain] +type=SrcClockDomain +children=voltage_domain +clock=1000 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.sqc_tlb.clk_domain.voltage_domain + +[system.sqc_tlb.clk_domain.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + +[system.sys_port_proxy] +type=RubyPortProxy +clk_domain=system.clk_domain +eventq_index=0 +is_cpu_sequencer=true +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_ruby_tester=false +version=0 +slave=system.system_port + +[system.tcc_cntrl0] +type=TCC_Controller +children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB +L2cache=system.tcc_cntrl0.L2cache +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +l2_request_latency=1 +l2_response_latency=16 +number_of_TBEs=5120 +probeFromNB=system.tcc_cntrl0.probeFromNB +recycle_latency=10 
+regionBufferNum=1 +requestFromTCP=system.tcc_cntrl0.requestFromTCP +requestToNB=system.tcc_cntrl0.requestToNB +responseFromNB=system.tcc_cntrl0.responseFromNB +responseToCore=system.tcc_cntrl0.responseToCore +responseToNB=system.tcc_cntrl0.responseToNB +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +triggerQueue=system.tcc_cntrl0.triggerQueue +unblockToNB=system.tcc_cntrl0.unblockToNB +version=0 + +[system.tcc_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=8 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=2097152 +start_index_bit=6 +tagAccessLatency=2 +tagArrayBanks=256 + +[system.tcc_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=2097152 + +[system.tcc_cntrl0.probeFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[14] + +[system.tcc_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[13] + +[system.tcc_cntrl0.requestToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[13] + +[system.tcc_cntrl0.responseFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[15] + +[system.tcc_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[12] + +[system.tcc_cntrl0.responseToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[14] + +[system.tcc_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.tcc_cntrl0.unblockToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[15] + +[system.tcc_rb_cntrl0] +type=RegionBuffer_Controller +children=cacheMemory notifyFromRegionDir probeFromRegionDir requestFromCore requestToNetwork responseFromCore responseToRegDir triggerQueue unblockFromDir +TCC_select_num_bits=0 +blocksPerRegion=16 +buffer_size=0 +cacheMemory=system.tcc_rb_cntrl0.cacheMemory +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +isOnCPU=false +nextEvictLatency=1 +noTCCdir=true +notifyFromRegionDir=system.tcc_rb_cntrl0.notifyFromRegionDir +number_of_TBEs=5120 +probeFromRegionDir=system.tcc_rb_cntrl0.probeFromRegionDir +recycle_latency=10 +requestFromCore=system.tcc_rb_cntrl0.requestFromCore +requestToNetwork=system.tcc_rb_cntrl0.requestToNetwork +responseFromCore=system.tcc_rb_cntrl0.responseFromCore +responseToRegDir=system.tcc_rb_cntrl0.responseToRegDir +ruby_system=system.ruby +system=system +toDirLatency=60 +toRegionDirLatency=120 +transitions_per_cycle=32 +triggerQueue=system.tcc_rb_cntrl0.triggerQueue +unblockFromDir=system.tcc_rb_cntrl0.unblockFromDir +version=1 + +[system.tcc_rb_cntrl0.cacheMemory] +type=RubyCache +children=replacement_policy +assoc=4 +block_size=1024 +dataAccessLatency=1 +dataArrayBanks=64 +eventq_index=0 +is_icache=false +replacement_policy=system.tcc_rb_cntrl0.cacheMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=1048576 +start_index_bit=10 +tagAccessLatency=1 +tagArrayBanks=64 + +[system.tcc_rb_cntrl0.cacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=4 +block_size=64 +eventq_index=0 +size=1048576 + +[system.tcc_rb_cntrl0.notifyFromRegionDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[18] + +[system.tcc_rb_cntrl0.probeFromRegionDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 
+ordered=false +randomization=false +slave=system.ruby.network.master[19] + +[system.tcc_rb_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[16] + +[system.tcc_rb_cntrl0.requestToNetwork] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[16] + +[system.tcc_rb_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[17] + +[system.tcc_rb_cntrl0.responseToRegDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[17] + +[system.tcc_rb_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.tcc_rb_cntrl0.unblockFromDir] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[20] + +[system.tcp_cntrl0] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl0.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl0.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl0.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl0.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl0.requestFromTCP +responseFromTCP=system.tcp_cntrl0.responseFromTCP +responseToTCP=system.tcp_cntrl0.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl0.unblockFromCore +use_seq_not_coal=false +version=0 + +[system.tcp_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=16 
+block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.tcp_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl0.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=2 +slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] 
system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63] + +[system.tcp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl0.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[7] + +[system.tcp_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[5] + +[system.tcp_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[6] + +[system.tcp_cntrl0.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[8] + +[system.tcp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 
+eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=3 + +[system.tcp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[7] + +[system.tcp_cntrl1] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl1.L1cache +TCC_select_num_bits=0 +WB=false +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl1.coalescer +disableL1=false +eventq_index=0 +issue_latency=1 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl1.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl1.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl1.requestFromTCP +responseFromTCP=system.tcp_cntrl1.responseFromTCP +responseToTCP=system.tcp_cntrl1.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl1.unblockFromCore +use_seq_not_coal=false +version=1 + +[system.tcp_cntrl1.L1cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=16 + +[system.tcp_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl1.coalescer] +type=VIPERCoalescer +assume_rfo=false +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 
+deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_inv_per_cycle=32 +max_outstanding_requests=2560 +max_wb_per_cycle=32 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=false +version=4 +slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] 
system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63] + +[system.tcp_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl1.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[9] + +[system.tcp_cntrl1.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[8] + +[system.tcp_cntrl1.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[9] + +[system.tcp_cntrl1.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[10] + +[system.tcp_cntrl1.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=5 + +[system.tcp_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[10] + +[system.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + diff --git 
a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr new file mode 100755 index 000000000..1e2b8911e --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr @@ -0,0 +1,5 @@ +warn: system.ruby.network adopting orphan SimObject param 'int_links' +warn: system.ruby.network adopting orphan SimObject param 'ext_links' +warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes) +warn: Sockets disabled, not accepting gdb connections +warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files! diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout new file mode 100755 index 000000000..8e5806b46 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout @@ -0,0 +1,21 @@ +gem5 Simulator System. http://gem5.org +gem5 is copyrighted software; use the --copyright option for details. + +gem5 compiled Jan 19 2016 13:45:43 +gem5 started Jan 19 2016 13:46:17 +gem5 executing on zizzer, pid 51290 +command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Region -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Region + +Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm +Global frequency set at 1000000000000 ticks per second +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +Forcing maxCoalescedReqs to 32 (TLB assoc.) +info: Entering event queue @ 0. Starting simulation... 
+keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23 +the gpu says: +elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe +Exiting @ tick 468854500 because target called exit() diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt new file mode 100644 index 000000000..6fbd50886 --- /dev/null +++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt @@ -0,0 +1,3418 @@ + +---------- Begin Simulation Statistics ---------- +sim_seconds 0.000469 # Number of seconds simulated +sim_ticks 468854500 # Number of ticks simulated +final_tick 468854500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset) +sim_freq 1000000000000 # Frequency of simulated ticks +host_inst_rate 67943 # Simulator instruction rate (inst/s) +host_op_rate 139717 # Simulator op (including micro ops) rate (op/s) +host_tick_rate 475693968 # Simulator tick rate (ticks/s) +host_mem_usage 1301796 # Number of bytes of host memory used +host_seconds 0.99 # Real time elapsed on the host +sim_insts 66963 # Number of instructions simulated +sim_ops 137705 # Number of ops (including micro ops) simulated +system.voltage_domain.voltage 1 # Voltage in Volts +system.clk_domain.clock 1000 # Clock period in ticks +system.mem_ctrls.bytes_read::dir_cntrl0 100032 # Number of bytes read from this memory +system.mem_ctrls.bytes_read::total 100032 # Number of bytes read from this memory +system.mem_ctrls.num_reads::dir_cntrl0 1563 # Number of read requests responded to by this memory +system.mem_ctrls.num_reads::total 
1563 # Number of read requests responded to by this memory +system.mem_ctrls.bw_read::dir_cntrl0 213354036 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_read::total 213354036 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_total::dir_cntrl0 213354036 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.bw_total::total 213354036 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.readReqs 1563 # Number of read requests accepted +system.mem_ctrls.writeReqs 0 # Number of write requests accepted +system.mem_ctrls.readBursts 1563 # Number of DRAM read bursts, including those serviced by the write queue +system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue +system.mem_ctrls.bytesReadDRAM 100032 # Total number of bytes read from DRAM +system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue +system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM +system.mem_ctrls.bytesReadSys 100032 # Total read bytes from the system interface side +system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side +system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue +system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one +system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write +system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts 
+system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::10 183 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::12 225 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts +system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry +system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry +system.mem_ctrls.totGap 468627000 # Total gap between requests +system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::3 0 # Read 
request sizes (log2) +system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::6 1563 # Read request sizes (log2) +system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2) +system.mem_ctrls.rdQLenPdf::0 1548 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::1 4 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::3 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::4 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::5 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::6 2 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::7 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming 
req see +system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::7 0 # What write queue length 
does an incoming req see +system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see 
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::54 0 # What 
write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see +system.mem_ctrls.bytesPerActivate::samples 450 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::mean 221.297778 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::gmean 151.217299 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::stdev 224.192300 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::0-127 165 36.67% 36.67% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::128-255 148 32.89% 69.56% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::256-383 55 12.22% 81.78% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::384-511 28 6.22% 88.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::512-639 19 4.22% 92.22% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::640-767 11 2.44% 94.67% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::768-895 8 1.78% 96.44% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::896-1023 6 1.33% 97.78% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::1024-1151 10 2.22% 100.00% # Bytes 
accessed per row activation +system.mem_ctrls.bytesPerActivate::total 450 # Bytes accessed per row activation +system.mem_ctrls.totQLat 14130749 # Total ticks spent queuing +system.mem_ctrls.totMemAccLat 43436999 # Total ticks spent from burst creation until serviced by the DRAM +system.mem_ctrls.totBusLat 7815000 # Total ticks spent in databus transfers +system.mem_ctrls.avgQLat 9040.79 # Average queueing delay per DRAM burst +system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst +system.mem_ctrls.avgMemAccLat 27790.79 # Average memory access latency per DRAM burst +system.mem_ctrls.avgRdBW 213.35 # Average DRAM read bandwidth in MiByte/s +system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s +system.mem_ctrls.avgRdBWSys 213.35 # Average system read bandwidth in MiByte/s +system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s +system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s +system.mem_ctrls.busUtil 1.67 # Data bus utilization in percentage +system.mem_ctrls.busUtilRead 1.67 # Data bus utilization in percentage for reads +system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes +system.mem_ctrls.avgRdQLen 1.01 # Average read queue length when enqueuing +system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing +system.mem_ctrls.readRowHits 1109 # Number of row buffer hits during reads +system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes +system.mem_ctrls.readRowHitRate 70.95 # Row buffer hit rate for reads +system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes +system.mem_ctrls.avgGap 299825.34 # Average gap between requests +system.mem_ctrls.pageHitRate 70.95 # Row buffer hit rate, read and write combined +system.mem_ctrls_0.actEnergy 1300320 # Energy for activate commands per rank (pJ) +system.mem_ctrls_0.preEnergy 709500 # Energy for precharge commands per rank (pJ) 
+system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ) +system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_0.refreshEnergy 30513600 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_0.actBackEnergy 265391145 # Energy for active background per rank (pJ) +system.mem_ctrls_0.preBackEnergy 47661750 # Energy for precharge background per rank (pJ) +system.mem_ctrls_0.totalEnergy 350911515 # Total energy per rank (pJ) +system.mem_ctrls_0.averagePower 750.717244 # Core power per rank (mW) +system.mem_ctrls_0.memoryStateTime::IDLE 79008000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::REF 15600000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT 374147000 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states +system.mem_ctrls_1.actEnergy 2101680 # Energy for activate commands per rank (pJ) +system.mem_ctrls_1.preEnergy 1146750 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_1.readEnergy 6801600 # Energy for read commands per rank (pJ) +system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_1.refreshEnergy 30513600 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_1.actBackEnergy 276170130 # Energy for active background per rank (pJ) +system.mem_ctrls_1.preBackEnergy 38206500 # Energy for precharge background per rank (pJ) +system.mem_ctrls_1.totalEnergy 354940260 # Total energy per rank (pJ) +system.mem_ctrls_1.averagePower 759.336079 # Core power per rank (mW) +system.mem_ctrls_1.memoryStateTime::IDLE 61948750 # Time in different power states +system.mem_ctrls_1.memoryStateTime::REF 15600000 # Time in different power states +system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT 
389900000 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states +system.ruby.clk_domain.clock 500 # Clock period in ticks +system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory +system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory +system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory +system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory +system.ruby.phys_mem.num_reads::total 104891 # Number of read requests 
responded to by this memory +system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory +system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory +system.ruby.phys_mem.bw_read::cpu0.inst 1486090034 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu0.data 255584622 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 6995774 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 6995774 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_read::total 1755666203 # Total read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu0.inst 1486090034 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 4265716 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 4265716 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_inst_read::total 1494621466 # Instruction read bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu0.data 155201667 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 546012 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 546012 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_write::total 156293690 # Write bandwidth from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.inst 1486090034 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu0.data 
410786289 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 7541785 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 7541785 # Total bandwidth to/from this memory (bytes/s) +system.ruby.phys_mem.bw_total::total 1911959894 # Total bandwidth to/from this memory (bytes/s) +system.ruby.outstanding_req_hist::bucket_size 1 +system.ruby.outstanding_req_hist::max_bucket 9 +system.ruby.outstanding_req_hist::samples 114203 +system.ruby.outstanding_req_hist::mean 1.000035 +system.ruby.outstanding_req_hist::gmean 1.000024 +system.ruby.outstanding_req_hist::stdev 0.005918 +system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.outstanding_req_hist::total 114203 +system.ruby.latency_hist::bucket_size 64 +system.ruby.latency_hist::max_bucket 639 +system.ruby.latency_hist::samples 114203 +system.ruby.latency_hist::mean 3.070988 +system.ruby.latency_hist::gmean 1.072272 +system.ruby.latency_hist::stdev 18.192328 +system.ruby.latency_hist | 112654 98.64% 98.64% | 11 0.01% 98.65% | 1238 1.08% 99.74% | 266 0.23% 99.97% | 14 0.01% 99.98% | 12 0.01% 99.99% | 7 0.01% 100.00% | 1 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.latency_hist::total 114203 +system.ruby.hit_latency_hist::bucket_size 64 +system.ruby.hit_latency_hist::max_bucket 639 +system.ruby.hit_latency_hist::samples 1549 +system.ruby.hit_latency_hist::mean 152.827631 +system.ruby.hit_latency_hist::gmean 149.009432 +system.ruby.hit_latency_hist::stdev 40.628532 +system.ruby.hit_latency_hist | 0 0.00% 0.00% | 11 0.71% 0.71% | 1238 79.92% 80.63% | 266 17.17% 97.81% | 14 0.90% 98.71% | 12 0.77% 99.48% | 7 0.45% 99.94% | 1 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.hit_latency_hist::total 1549 
+system.ruby.miss_latency_hist::bucket_size 2 +system.ruby.miss_latency_hist::max_bucket 19 +system.ruby.miss_latency_hist::samples 112654 +system.ruby.miss_latency_hist::mean 1.011824 +system.ruby.miss_latency_hist::gmean 1.001936 +system.ruby.miss_latency_hist::stdev 0.461184 +system.ruby.miss_latency_hist | 112580 99.93% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 74 0.07% 100.00% +system.ruby.miss_latency_hist::total 112654 +system.ruby.L1Cache.incomplete_times 112580 +system.ruby.L2Cache.incomplete_times 74 +system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D0cache.demand_misses 1556 # Number of cache demand misses +system.cp_cntrl0.L1D0cache.demand_accesses 1556 # Number of cache demand accesses +system.cp_cntrl0.L1D0cache.num_data_array_reads 16142 # number of data array reads +system.cp_cntrl0.L1D0cache.num_data_array_writes 11998 # number of data array writes +system.cp_cntrl0.L1D0cache.num_tag_array_reads 27136 # number of tag array reads +system.cp_cntrl0.L1D0cache.num_tag_array_writes 1431 # number of tag array writes +system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses +system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1Icache.demand_misses 1287 # Number of cache demand misses +system.cp_cntrl0.L1Icache.demand_accesses 1287 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.num_data_array_reads 85994 # number of data array reads +system.cp_cntrl0.L1Icache.num_data_array_writes 67 # number of data array writes +system.cp_cntrl0.L1Icache.num_tag_array_reads 87697 # number of tag array reads +system.cp_cntrl0.L1Icache.num_tag_array_writes 67 # number of tag array writes 
+system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L2cache.demand_misses 1549 # Number of cache demand misses +system.cp_cntrl0.L2cache.demand_accesses 1549 # Number of cache demand accesses +system.cp_cntrl0.L2cache.num_data_array_reads 167 # number of data array reads +system.cp_cntrl0.L2cache.num_data_array_writes 11993 # number of data array writes +system.cp_cntrl0.L2cache.num_tag_array_reads 12092 # number of tag array reads +system.cp_cntrl0.L2cache.num_tag_array_writes 1694 # number of tag array writes +system.cpu0.clk_domain.clock 500 # Clock period in ticks +system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks +system.cpu0.workload.num_syscalls 21 # Number of system calls +system.cpu0.numCycles 937709 # number of cpu cycles simulated +system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started +system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed +system.cpu0.committedInsts 66963 # Number of instructions committed +system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed +system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses +system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses +system.cpu0.num_func_calls 3196 # number of times a function call or return occured +system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls +system.cpu0.num_int_insts 136380 # number of integer instructions +system.cpu0.num_fp_insts 1279 # number of float instructions +system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read +system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written +system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read +system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written +system.cpu0.num_cc_register_reads 78262 # number of times 
the CC registers were read +system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written +system.cpu0.num_mem_refs 27198 # number of memory refs +system.cpu0.num_load_insts 16684 # Number of load instructions +system.cpu0.num_store_insts 10514 # Number of store instructions +system.cpu0.num_idle_cycles 7323.003984 # Number of idle cycles +system.cpu0.num_busy_cycles 930385.996016 # Number of busy cycles +system.cpu0.not_idle_fraction 0.992191 # Percentage of non-idle cycles +system.cpu0.idle_fraction 0.007809 # Percentage of idle cycles +system.cpu0.Branches 16199 # Number of branches fetched +system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction +system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction +system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction +system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction +system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction 
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction +system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction +system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction +system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction +system.cpu0.op_class::total 137705 # Class of executed instruction +system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.cpu1.clk_domain.clock 1000 # Clock period in ticks +system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 271 # number of times the wf's instructions are 
blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # 
number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed 
instructions with N destination register operands +system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 252 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N 
source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port 
availability +system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 243 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 228 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions 
you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4103 # number of cycles the CU issues nothing +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 133 # number of cycles the CU issued at least one instruction +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at 
least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1359 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 382 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 338 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 302 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 373 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued +system.cpu1.CUs0.ExecStage.spc::samples 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::mean 0.033286 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::stdev 0.190882 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::0 4103 96.86% 96.86% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::1 126 2.97% 99.83% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::2 6 0.14% 99.98% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::3 1 0.02% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::5 
0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.spc::total 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 53.455882 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 203.558231 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 48 70.59% 70.59% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 82.35% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.47% 83.82% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.47% 85.29% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.94% 88.24% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 
89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1317 # duration of idle periods in cycles +system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles +system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs0.tlb_cycles -318202403000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs0.avg_translation_latency -413787260.078023 # Avg. 
translation latency for data translations +system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses +system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank 
conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet 
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::mean 84.978723 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::stdev 240.114362 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::6-7 31 21.99% 68.79% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::8-9 3 2.13% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::max_value 1320 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs0.num_total_cycles 4236 # number of cycles the CU ran for +system.cpu1.CUs0.vpc 1.597970 # Vector Operations per cycle (this CU only) +system.cpu1.CUs0.ipc 0.033286 # Instructions per cycle (this CU only) +system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts +system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 276 # number of times the wf's 
instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # 
number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed 
instructions with N destination register operands +system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 254 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N 
source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port 
availability +system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register 
operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability 
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 251 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to 
RAW dependencies +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 236 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed 
instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR 
dependencies +system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands 
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability +system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies +system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source 
register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands 
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions 
you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got 
from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it 
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it +system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4105 # number of cycles the CU issues nothing +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 131 # number of cycles the CU issued at least one instruction +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at 
least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1525 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 346 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 363 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 363 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 363 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 33 # Number of cycles no instruction of specific type issued +system.cpu1.CUs1.ExecStage.spc::samples 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::mean 0.033286 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::stdev 0.194558 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::0 4105 96.91% 96.91% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::1 123 2.90% 99.81% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::2 6 0.14% 99.95% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::3 2 0.05% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::5 
0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.spc::total 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe) +system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 74 # number of CU transitions from active to idle +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 74 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 51.891892 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 210.095188 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 56 75.68% 75.68% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 7 9.46% 85.14% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 85.14% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.70% 87.84% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.35% 89.19% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.35% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 
90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 90.54% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 9.46% 100.00% # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1321 # duration of idle periods in cycles +system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 74 # duration of idle periods in cycles +system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF +system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF +system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests +system.cpu1.CUs1.tlb_cycles -318199598000 # total number of cycles for all uncoalesced requests +system.cpu1.CUs1.avg_translation_latency -413783612.483745 # Avg. 
translation latency for data translations +system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB +system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses +system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number 
of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory 
packet +system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet +system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.) +system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.) 
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count +system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count +system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity +system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed +system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::mean 86.326241 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::stdev 246.713874 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::6-7 29 20.57% 67.38% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::max_value 1324 # Instruction Execution Rate: Number of 
executed vector instructions per cycle +system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle +system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst) +system.cpu1.CUs1.num_total_cycles 4236 # number of cycles the CU ran for +system.cpu1.CUs1.vpc 1.596317 # Vector Operations per cycle (this CU only) +system.cpu1.CUs1.ipc 0.033286 # Instructions per cycle (this CU only) +system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::33-36 0 
0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions) +system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global 
memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction 
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction +system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction 
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction +system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction +system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed +system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD +system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations +system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed +system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts +system.cpu2.num_kernel_launched 1 # number of kernel launched +system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits +system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses +system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses +system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1600 # number 
of data array writes +system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1602 # number of tag array reads +system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1572 # number of tag array writes +system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses +system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses +system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue +system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts +system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks +system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses +system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits +system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses +system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate +system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level +system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched +system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs +system.dispatcher_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses +system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts +system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks +system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses +system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses +system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue +system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_coalescer1.local_latency 0 # Avg. 
latency over all incoming pkts +system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses +system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits +system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses +system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate +system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb0.unique_pages 4 # Number of unique pages touched +system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb0.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks +system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses +system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits +system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses +system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate +system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level +system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table +system.l1_tlb1.unique_pages 3 # Number of unique pages touched +system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs +system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs +system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses +system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l2_coalescer.local_latency 125 # Avg. 
latency over all incoming pkts +system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses +system.l2_tlb.local_TLB_hits 3 # Number of TLB hits +system.l2_tlb.local_TLB_misses 5 # Number of TLB misses +system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate +system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses +system.l2_tlb.global_TLB_hits 3 # Number of TLB hits +system.l2_tlb.global_TLB_misses 12 # Number of TLB misses +system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate +system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level +system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.l2_tlb.unique_pages 5 # Number of unique pages touched +system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs +system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs +system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks) +system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses +system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses +system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue +system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs +system.l3_coalescer.local_latency 200 # Avg. 
latency over all incoming pkts +system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks +system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses +system.l3_tlb.local_TLB_hits 0 # Number of TLB hits +system.l3_tlb.local_TLB_misses 5 # Number of TLB misses +system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses +system.l3_tlb.global_TLB_hits 0 # Number of TLB hits +system.l3_tlb.global_TLB_misses 12 # Number of TLB misses +system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate +system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level +system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table +system.l3_tlb.unique_pages 5 # Number of unique pages touched +system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs +system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs +system.l3_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.piobus.trans_dist::WriteReq 94 # Transaction distribution +system.piobus.trans_dist::WriteResp 94 # Transaction distribution +system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes) +system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes) +system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks) +system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%) +system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks) +system.piobus.respLayer0.utilization 0.0 # Layer utilization (%) +system.rb_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits +system.rb_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses +system.rb_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses +system.rb_cntrl0.cacheMemory.num_tag_array_reads 1553 # number of tag array reads +system.rb_cntrl0.cacheMemory.num_tag_array_writes 3123 # number of tag array writes +system.reg_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits +system.reg_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses +system.reg_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses +system.reg_cntrl0.cacheMemory.num_tag_array_reads 279 # number of tag array reads +system.reg_cntrl0.cacheMemory.num_tag_array_writes 279 # number of tag array writes +system.ruby.network.ext_links0.int_node.percent_links_utilized 0.122493 +system.ruby.network.ext_links0.int_node.msg_count.Data::0 16 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1558 
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::5 279 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::7 279 +system.ruby.network.ext_links0.int_node.msg_count.Request_Control::8 8 +system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1577 +system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 303 +system.ruby.network.ext_links0.int_node.msg_count.Response_Control::4 34 +system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 24 +system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1556 +system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1152 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12464 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::5 2232 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::7 2232 +system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::8 64 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 113544 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 2424 +system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::4 272 +system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 192 +system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12448 +system.ruby.network.ext_links2.int_node.percent_links_utilized 0.185852 +system.ruby.network.ext_links2.int_node.msg_count.Control::0 23 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 3098 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::7 274 +system.ruby.network.ext_links2.int_node.msg_count.Request_Control::8 4 +system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 1568 +system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 281 +system.ruby.network.ext_links2.int_node.msg_count.Response_Control::4 23 +system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 3098 
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 184 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 24784 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::7 2192 +system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 112896 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 2248 +system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::4 184 +system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 24784 +system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads +system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes +system.tcp_cntrl0.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array +system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in 
the TCP +system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.network.ext_links4.int_node.percent_links_utilized 0.003510 +system.ruby.network.ext_links4.int_node.msg_count.Control::0 11 +system.ruby.network.ext_links4.int_node.msg_count.Data::0 34 +system.ruby.network.ext_links4.int_node.msg_count.Data::1 18 +system.ruby.network.ext_links4.int_node.msg_count.Request_Control::0 16 +system.ruby.network.ext_links4.int_node.msg_count.Request_Control::1 9 +system.ruby.network.ext_links4.int_node.msg_count.Request_Control::7 5 +system.ruby.network.ext_links4.int_node.msg_count.Request_Control::8 4 +system.ruby.network.ext_links4.int_node.msg_count.Response_Data::2 9 +system.ruby.network.ext_links4.int_node.msg_count.Response_Data::3 11 +system.ruby.network.ext_links4.int_node.msg_count.Response_Control::2 22 +system.ruby.network.ext_links4.int_node.msg_count.Response_Control::4 11 +system.ruby.network.ext_links4.int_node.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links4.int_node.msg_count.Writeback_Control::3 16 +system.ruby.network.ext_links4.int_node.msg_count.Unblock_Control::4 32 +system.ruby.network.ext_links4.int_node.msg_bytes.Control::0 88 +system.ruby.network.ext_links4.int_node.msg_bytes.Data::0 2448 +system.ruby.network.ext_links4.int_node.msg_bytes.Data::1 1296 +system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::0 128 +system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::1 72 
+system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::7 40 +system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links4.int_node.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links4.int_node.msg_bytes.Response_Data::3 792 +system.ruby.network.ext_links4.int_node.msg_bytes.Response_Control::2 176 +system.ruby.network.ext_links4.int_node.msg_bytes.Response_Control::4 88 +system.ruby.network.ext_links4.int_node.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links4.int_node.msg_bytes.Writeback_Control::3 128 +system.ruby.network.ext_links4.int_node.msg_bytes.Unblock_Control::4 256 +system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads +system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes +system.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads +system.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes +system.tcp_cntrl1.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array +system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array +system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in 
the GPU +system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads +system.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads +system.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes +system.sqc_cntrl0.sequencer.load_waiting_on_load 98 # Number of times a load aliased with a pending load +system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses +system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses +system.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes +system.tcc_cntrl0.L2cache.num_tag_array_reads 45 # number of tag array reads +system.tcc_cntrl0.L2cache.num_tag_array_writes 21 # number of tag array writes +system.tcc_rb_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits +system.tcc_rb_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses +system.tcc_rb_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses +system.tcc_rb_cntrl0.cacheMemory.num_tag_array_reads 29 # number of tag array reads 
+system.tcc_rb_cntrl0.cacheMemory.num_tag_array_writes 89 # number of tag array writes +system.tcc_rb_cntrl0.cacheMemory.num_tag_array_stalls 20 # number of stalls caused by tag array +system.ruby.network.msg_count.Control 34 +system.ruby.network.msg_count.Data 68 +system.ruby.network.msg_count.Request_Control 5534 +system.ruby.network.msg_count.Response_Data 3165 +system.ruby.network.msg_count.Response_Control 674 +system.ruby.network.msg_count.Writeback_Control 56 +system.ruby.network.msg_count.Unblock_Control 4686 +system.ruby.network.msg_byte.Control 272 +system.ruby.network.msg_byte.Data 4896 +system.ruby.network.msg_byte.Request_Control 44272 +system.ruby.network.msg_byte.Response_Data 227880 +system.ruby.network.msg_byte.Response_Control 5392 +system.ruby.network.msg_byte.Writeback_Control 448 +system.ruby.network.msg_byte.Unblock_Control 37488 +system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks +system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses +system.sqc_coalescer.coalesced_accesses 66 # Number of coalesced TLB accesses +system.sqc_coalescer.queuing_cycles 288000 # Number of cycles spent in queue +system.sqc_coalescer.local_queuing_cycles 288000 # Number of cycles spent in queue for all incoming reqs +system.sqc_coalescer.local_latency 3348.837209 # Avg. 
latency over all incoming pkts +system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts +system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks +system.sqc_tlb.local_TLB_accesses 66 # Number of TLB accesses +system.sqc_tlb.local_TLB_hits 65 # Number of TLB hits +system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses +system.sqc_tlb.local_TLB_miss_rate 1.515152 # TLB miss rate +system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses +system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits +system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses +system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate +system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level +system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table +system.sqc_tlb.unique_pages 1 # Number of unique pages touched +system.sqc_tlb.local_cycles 66001 # Number of cycles spent in queue for all incoming reqs +system.sqc_tlb.local_latency 1000.015152 # Avg. latency over incoming coalesced reqs +system.sqc_tlb.avg_reuse_distance 0 # avg. 
reuse distance over all pages (in ticks) +system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.091873 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 16 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1279 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::5 279 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 19 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 17 +system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1556 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1152 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 10232 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::5 2232 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 1368 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 136 +system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12448 +system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.015277 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Request_Control::0 279 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Control::2 286 +system.ruby.network.ext_links0.int_node.throttle1.msg_count.Writeback_Control::2 8 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Request_Control::0 2232 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Control::2 2288 +system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Writeback_Control::2 64 +system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.379702 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Request_Control::7 274 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Request_Control::8 4 
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 1549 +system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Control::4 23 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Request_Control::7 2192 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 111528 +system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Control::4 184 +system.ruby.network.ext_links0.int_node.throttle3.link_utilization 0.003119 +system.ruby.network.ext_links0.int_node.throttle3.msg_count.Request_Control::7 5 +system.ruby.network.ext_links0.int_node.throttle3.msg_count.Request_Control::8 4 +system.ruby.network.ext_links0.int_node.throttle3.msg_count.Response_Data::2 9 +system.ruby.network.ext_links0.int_node.throttle3.msg_count.Response_Control::4 11 +system.ruby.network.ext_links0.int_node.throttle3.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Request_Control::7 40 +system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Response_Control::4 88 +system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.372290 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Control::0 23 +system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::2 1549 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Control::0 184 +system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::2 111528 +system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.090620 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::0 1549 
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::7 274 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::8 4 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Control::4 23 +system.ruby.network.ext_links2.int_node.throttle1.msg_count.Unblock_Control::4 1549 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::0 12392 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::7 2192 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Control::4 184 +system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Unblock_Control::4 12392 +system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.094646 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::0 1549 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 19 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Control::2 281 +system.ruby.network.ext_links2.int_node.throttle2.msg_count.Unblock_Control::4 1549 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::0 12392 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 1368 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Control::2 2248 +system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Unblock_Control::4 12392 +system.ruby.network.ext_links4.int_node.throttle0.link_utilization 0.000933 +system.ruby.network.ext_links4.int_node.throttle0.msg_count.Response_Data::3 3 +system.ruby.network.ext_links4.int_node.throttle0.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links4.int_node.throttle0.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links4.int_node.throttle0.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links4.int_node.throttle1.link_utilization 0.000933 
+system.ruby.network.ext_links4.int_node.throttle1.msg_count.Response_Data::3 3 +system.ruby.network.ext_links4.int_node.throttle1.msg_count.Writeback_Control::3 8 +system.ruby.network.ext_links4.int_node.throttle1.msg_bytes.Response_Data::3 216 +system.ruby.network.ext_links4.int_node.throttle1.msg_bytes.Writeback_Control::3 64 +system.ruby.network.ext_links4.int_node.throttle2.link_utilization 0.007438 +system.ruby.network.ext_links4.int_node.throttle2.msg_count.Control::0 11 +system.ruby.network.ext_links4.int_node.throttle2.msg_count.Data::1 18 +system.ruby.network.ext_links4.int_node.throttle2.msg_count.Request_Control::1 9 +system.ruby.network.ext_links4.int_node.throttle2.msg_count.Response_Data::2 9 +system.ruby.network.ext_links4.int_node.throttle2.msg_count.Writeback_Control::2 16 +system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Control::0 88 +system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Data::1 1296 +system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Request_Control::1 72 +system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Response_Data::2 648 +system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Writeback_Control::2 128 +system.ruby.network.ext_links4.int_node.throttle3.link_utilization 0.001200 +system.ruby.network.ext_links4.int_node.throttle3.msg_count.Response_Data::3 5 +system.ruby.network.ext_links4.int_node.throttle3.msg_bytes.Response_Data::3 360 +system.ruby.network.ext_links4.int_node.throttle4.link_utilization 0.005705 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Data::0 18 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::0 7 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::7 5 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::8 4 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Response_Control::4 11 +system.ruby.network.ext_links4.int_node.throttle4.msg_count.Unblock_Control::4 
25 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Data::0 1296 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::0 56 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::7 40 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::8 32 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Response_Control::4 88 +system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Unblock_Control::4 200 +system.ruby.network.ext_links4.int_node.throttle5.link_utilization 0.004852 +system.ruby.network.ext_links4.int_node.throttle5.msg_count.Data::0 16 +system.ruby.network.ext_links4.int_node.throttle5.msg_count.Request_Control::0 9 +system.ruby.network.ext_links4.int_node.throttle5.msg_count.Response_Control::2 22 +system.ruby.network.ext_links4.int_node.throttle5.msg_count.Unblock_Control::4 7 +system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Data::0 1152 +system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Request_Control::0 72 +system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Response_Control::2 176 +system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Unblock_Control::4 56 +system.ruby.CorePair_Controller.C0_Load_L1miss 193 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Load_L1hit 16142 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1hit 85994 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1miss 1101 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1miss 327 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1hit 10446 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckS 1047 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckM 329 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckE 173 0.00% 0.00% +system.ruby.CorePair_Controller.L1I_Repl 602 0.00% 0.00% +system.ruby.CorePair_Controller.L1D0_Repl 28 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1D0 7 0.00% 0.00% +system.ruby.CorePair_Controller.L2_to_L1I 67 
0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvData 15 0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvDataDemand 2 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrData 4 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrDataDemand 2 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Load_L1miss 186 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbInvDataDemand 1 0.00% 0.00% +system.ruby.CorePair_Controller.S.C0_Load_L1hit 643 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1hit 85994 0.00% 0.00% +system.ruby.CorePair_Controller.S.Ifetch0_L1miss 67 0.00% 0.00% +system.ruby.CorePair_Controller.S.C0_Store_L1hit 4 0.00% 0.00% +system.ruby.CorePair_Controller.S.L1I_Repl 602 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Load_L1hit 2728 0.00% 0.00% +system.ruby.CorePair_Controller.E0.C0_Store_L1hit 50 0.00% 0.00% +system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.E0.PrbShrDataDemand 1 0.00% 0.00% +system.ruby.CorePair_Controller.O.PrbInvData 4 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1miss 5 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12771 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10392 0.00% 0.00% +system.ruby.CorePair_Controller.M0.L1D0_Repl 12 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvData 10 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvDataDemand 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrData 3 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrDataDemand 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00% 
+system.ruby.CorePair_Controller.I_E0S.NB_AckS 13 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.NB_AckE 173 0.00% 0.00% +system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 67 0.00% 0.00% +system.ruby.CorePair_Controller.S_M0.NB_AckM 4 0.00% 0.00% +system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00% +system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 5 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkS 190 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkM 31 0.00% 0.00% +system.ruby.Directory_Controller.RdBlk 56 0.00% 0.00% +system.ruby.Directory_Controller.WriteThrough 1 0.00% 0.00% +system.ruby.Directory_Controller.Atomic 1 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkSP 844 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkMP 298 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkP 137 0.00% 0.00% +system.ruby.Directory_Controller.WriteThroughP 15 0.00% 0.00% +system.ruby.Directory_Controller.AtomicP 1 0.00% 0.00% +system.ruby.Directory_Controller.CPUPrbResp 28 0.00% 0.00% +system.ruby.Directory_Controller.LastCPUPrbResp 8 0.00% 0.00% +system.ruby.Directory_Controller.ProbeAcksComplete 271 0.00% 0.00% +system.ruby.Directory_Controller.L3Hit 11 0.00% 0.00% +system.ruby.Directory_Controller.MemData 1563 0.00% 0.00% +system.ruby.Directory_Controller.CoreUnblock 1556 0.00% 0.00% +system.ruby.Directory_Controller.UnblockWriteThrough 18 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkS 190 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkM 31 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlk 56 0.00% 0.00% +system.ruby.Directory_Controller.U.WriteThrough 1 0.00% 0.00% +system.ruby.Directory_Controller.U.Atomic 1 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkSP 844 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkMP 298 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkP 137 0.00% 0.00% +system.ruby.Directory_Controller.U.WriteThroughP 15 0.00% 0.00% 
+system.ruby.Directory_Controller.U.AtomicP 1 0.00% 0.00% +system.ruby.Directory_Controller.U.CPUPrbResp 28 0.00% 0.00% +system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00% +system.ruby.Directory_Controller.BM_M.MemData 347 0.00% 0.00% +system.ruby.Directory_Controller.B_M.L3Hit 11 0.00% 0.00% +system.ruby.Directory_Controller.B_M.MemData 180 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 190 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.LastCPUPrbResp 4 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 29 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.LastCPUPrbResp 2 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 52 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.MemData 2 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.LastCPUPrbResp 2 0.00% 0.00% +system.ruby.Directory_Controller.B.CoreUnblock 1556 0.00% 0.00% +system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00% +system.ruby.RegionBuffer_Controller.CPURead | 1220 99.43% 99.43% | 7 0.57% 100.00% +system.ruby.RegionBuffer_Controller.CPURead::total 1227 +system.ruby.RegionBuffer_Controller.CPUWrite | 331 89.95% 89.95% | 37 10.05% 100.00% +system.ruby.RegionBuffer_Controller.CPUWrite::total 368 +system.ruby.RegionBuffer_Controller.PrivateNotify | 272 98.91% 98.91% | 3 1.09% 100.00% +system.ruby.RegionBuffer_Controller.PrivateNotify::total 275 +system.ruby.RegionBuffer_Controller.SharedNotify | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.SharedNotify::total 4 +system.ruby.RegionBuffer_Controller.InvRegion | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.InvRegion::total 4 +system.ruby.RegionBuffer_Controller.DowngradeRegion | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.DowngradeRegion::total 4 +system.ruby.RegionBuffer_Controller.InvAck | 23 67.65% 67.65% | 11 32.35% 100.00% +system.ruby.RegionBuffer_Controller.InvAck::total 34 
+system.ruby.RegionBuffer_Controller.DoneAck | 1572 96.26% 96.26% | 61 3.74% 100.00% +system.ruby.RegionBuffer_Controller.DoneAck::total 1633 +system.ruby.RegionBuffer_Controller.AllOutstanding | 6 54.55% 54.55% | 5 45.45% 100.00% +system.ruby.RegionBuffer_Controller.AllOutstanding::total 11 +system.ruby.RegionBuffer_Controller.Evict | 64 66.67% 66.67% | 32 33.33% 100.00% +system.ruby.RegionBuffer_Controller.Evict::total 96 +system.ruby.RegionBuffer_Controller.LastAck_PrbResp | 4 50.00% 50.00% | 4 50.00% 100.00% +system.ruby.RegionBuffer_Controller.LastAck_PrbResp::total 8 +system.ruby.RegionBuffer_Controller.StallAccess | 0 0.00% 0.00% | 16 100.00% 100.00% +system.ruby.RegionBuffer_Controller.StallAccess::total 16 +system.ruby.RegionBuffer_Controller.NP.CPURead | 243 98.78% 98.78% | 3 1.22% 100.00% +system.ruby.RegionBuffer_Controller.NP.CPURead::total 246 +system.ruby.RegionBuffer_Controller.NP.CPUWrite | 29 96.67% 96.67% | 1 3.33% 100.00% +system.ruby.RegionBuffer_Controller.NP.CPUWrite::total 30 +system.ruby.RegionBuffer_Controller.P.CPURead | 965 99.59% 99.59% | 4 0.41% 100.00% +system.ruby.RegionBuffer_Controller.P.CPURead::total 969 +system.ruby.RegionBuffer_Controller.P.CPUWrite | 298 94.90% 94.90% | 16 5.10% 100.00% +system.ruby.RegionBuffer_Controller.P.CPUWrite::total 314 +system.ruby.RegionBuffer_Controller.P.InvRegion | 1 100.00% 100.00% | 0 0.00% 100.00% +system.ruby.RegionBuffer_Controller.P.InvRegion::total 1 +system.ruby.RegionBuffer_Controller.P.DowngradeRegion | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.P.DowngradeRegion::total 4 +system.ruby.RegionBuffer_Controller.P.DoneAck | 1535 98.52% 98.52% | 23 1.48% 100.00% +system.ruby.RegionBuffer_Controller.P.DoneAck::total 1558 +system.ruby.RegionBuffer_Controller.P.StallAccess | 0 0.00% 0.00% | 15 100.00% 100.00% +system.ruby.RegionBuffer_Controller.P.StallAccess::total 15 +system.ruby.RegionBuffer_Controller.S.CPURead | 12 100.00% 100.00% | 0 0.00% 100.00% 
+system.ruby.RegionBuffer_Controller.S.CPURead::total 12 +system.ruby.RegionBuffer_Controller.S.CPUWrite | 2 66.67% 66.67% | 1 33.33% 100.00% +system.ruby.RegionBuffer_Controller.S.CPUWrite::total 3 +system.ruby.RegionBuffer_Controller.S.InvRegion | 1 33.33% 33.33% | 2 66.67% 100.00% +system.ruby.RegionBuffer_Controller.S.InvRegion::total 3 +system.ruby.RegionBuffer_Controller.S.DoneAck | 14 87.50% 87.50% | 2 12.50% 100.00% +system.ruby.RegionBuffer_Controller.S.DoneAck::total 16 +system.ruby.RegionBuffer_Controller.NP_PS.PrivateNotify | 270 99.26% 99.26% | 2 0.74% 100.00% +system.ruby.RegionBuffer_Controller.NP_PS.PrivateNotify::total 272 +system.ruby.RegionBuffer_Controller.NP_PS.SharedNotify | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.NP_PS.SharedNotify::total 4 +system.ruby.RegionBuffer_Controller.NP_PS.DoneAck | 8 25.81% 25.81% | 23 74.19% 100.00% +system.ruby.RegionBuffer_Controller.NP_PS.DoneAck::total 31 +system.ruby.RegionBuffer_Controller.NP_PS.StallAccess | 0 0.00% 0.00% | 1 100.00% 100.00% +system.ruby.RegionBuffer_Controller.NP_PS.StallAccess::total 1 +system.ruby.RegionBuffer_Controller.S_P.CPUWrite | 0 0.00% 0.00% | 18 100.00% 100.00% +system.ruby.RegionBuffer_Controller.S_P.CPUWrite::total 18 +system.ruby.RegionBuffer_Controller.S_P.PrivateNotify | 2 66.67% 66.67% | 1 33.33% 100.00% +system.ruby.RegionBuffer_Controller.S_P.PrivateNotify::total 3 +system.ruby.RegionBuffer_Controller.S_P.DoneAck | 15 53.57% 53.57% | 13 46.43% 100.00% +system.ruby.RegionBuffer_Controller.S_P.DoneAck::total 28 +system.ruby.RegionBuffer_Controller.P_NP.InvAck | 17 60.71% 60.71% | 11 39.29% 100.00% +system.ruby.RegionBuffer_Controller.P_NP.InvAck::total 28 +system.ruby.RegionBuffer_Controller.P_NP.Evict | 32 50.00% 50.00% | 32 50.00% 100.00% +system.ruby.RegionBuffer_Controller.P_NP.Evict::total 64 +system.ruby.RegionBuffer_Controller.P_NP.LastAck_PrbResp | 2 50.00% 50.00% | 2 50.00% 100.00% 
+system.ruby.RegionBuffer_Controller.P_NP.LastAck_PrbResp::total 4 +system.ruby.RegionBuffer_Controller.P_S.InvAck | 6 100.00% 100.00% | 0 0.00% 100.00% +system.ruby.RegionBuffer_Controller.P_S.InvAck::total 6 +system.ruby.RegionBuffer_Controller.P_S.Evict | 32 100.00% 100.00% | 0 0.00% 100.00% +system.ruby.RegionBuffer_Controller.P_S.Evict::total 32 +system.ruby.RegionBuffer_Controller.P_S.LastAck_PrbResp | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.P_S.LastAck_PrbResp::total 4 +system.ruby.RegionBuffer_Controller.P_NP_O.AllOutstanding | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.P_NP_O.AllOutstanding::total 4 +system.ruby.RegionBuffer_Controller.P_S_O.AllOutstanding | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.RegionBuffer_Controller.P_S_O.AllOutstanding::total 4 +system.ruby.RegionBuffer_Controller.S_O.AllOutstanding | 2 66.67% 66.67% | 1 33.33% 100.00% +system.ruby.RegionBuffer_Controller.S_O.AllOutstanding::total 3 +system.ruby.RegionBuffer_Controller.SS_P.CPUWrite | 2 66.67% 66.67% | 1 33.33% 100.00% +system.ruby.RegionBuffer_Controller.SS_P.CPUWrite::total 3 +system.ruby.RegionDir_Controller.SendInv 1 0.00% 0.00% +system.ruby.RegionDir_Controller.SendUpgrade 3 0.00% 0.00% +system.ruby.RegionDir_Controller.SendDowngrade 4 0.00% 0.00% +system.ruby.RegionDir_Controller.PrivateRequest 271 0.00% 0.00% +system.ruby.RegionDir_Controller.InvAckCore 4 0.00% 0.00% +system.ruby.RegionDir_Controller.InvAckCoreNoShare 4 0.00% 0.00% +system.ruby.RegionDir_Controller.CPUPrivateAck 278 0.00% 0.00% +system.ruby.RegionDir_Controller.LastAck 8 0.00% 0.00% +system.ruby.RegionDir_Controller.DirReadyAck 8 0.00% 0.00% +system.ruby.RegionDir_Controller.TriggerInv 4 0.00% 0.00% +system.ruby.RegionDir_Controller.TriggerDowngrade 4 0.00% 0.00% +system.ruby.RegionDir_Controller.NP.PrivateRequest 271 0.00% 0.00% +system.ruby.RegionDir_Controller.P.SendInv 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P.SendDowngrade 4 
0.00% 0.00% +system.ruby.RegionDir_Controller.S.SendUpgrade 3 0.00% 0.00% +system.ruby.RegionDir_Controller.NP_P.CPUPrivateAck 270 0.00% 0.00% +system.ruby.RegionDir_Controller.P_P.CPUPrivateAck 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P_S.CPUPrivateAck 4 0.00% 0.00% +system.ruby.RegionDir_Controller.S_P.CPUPrivateAck 3 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AS.InvAckCore 4 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AS.LastAck 4 0.00% 0.00% +system.ruby.RegionDir_Controller.S_AP.InvAckCoreNoShare 3 0.00% 0.00% +system.ruby.RegionDir_Controller.S_AP.LastAck 3 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AP.InvAckCoreNoShare 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AP.LastAck 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AP_W.DirReadyAck 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AP_W.TriggerInv 1 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AS_W.DirReadyAck 4 0.00% 0.00% +system.ruby.RegionDir_Controller.P_AS_W.TriggerDowngrade 4 0.00% 0.00% +system.ruby.RegionDir_Controller.S_AP_W.DirReadyAck 3 0.00% 0.00% +system.ruby.RegionDir_Controller.S_AP_W.TriggerInv 3 0.00% 0.00% +system.ruby.LD.latency_hist::bucket_size 64 +system.ruby.LD.latency_hist::max_bucket 639 +system.ruby.LD.latency_hist::samples 16335 +system.ruby.LD.latency_hist::mean 2.844751 +system.ruby.LD.latency_hist::gmean 1.060634 +system.ruby.LD.latency_hist::stdev 17.742972 +system.ruby.LD.latency_hist | 16149 98.86% 98.86% | 11 0.07% 98.93% | 119 0.73% 99.66% | 52 0.32% 99.98% | 2 0.01% 99.99% | 1 0.01% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.latency_hist::total 16335 +system.ruby.LD.hit_latency_hist::bucket_size 64 +system.ruby.LD.hit_latency_hist::max_bucket 639 +system.ruby.LD.hit_latency_hist::samples 186 +system.ruby.LD.hit_latency_hist::mean 162.333333 +system.ruby.LD.hit_latency_hist::gmean 157.431876 +system.ruby.LD.hit_latency_hist::stdev 43.755298 +system.ruby.LD.hit_latency_hist | 0 0.00% 
0.00% | 11 5.91% 5.91% | 119 63.98% 69.89% | 52 27.96% 97.85% | 2 1.08% 98.92% | 1 0.54% 99.46% | 1 0.54% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.hit_latency_hist::total 186 +system.ruby.LD.miss_latency_hist::bucket_size 2 +system.ruby.LD.miss_latency_hist::max_bucket 19 +system.ruby.LD.miss_latency_hist::samples 16149 +system.ruby.LD.miss_latency_hist::mean 1.007802 +system.ruby.LD.miss_latency_hist::gmean 1.001277 +system.ruby.LD.miss_latency_hist::stdev 0.374686 +system.ruby.LD.miss_latency_hist | 16142 99.96% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 7 0.04% 100.00% +system.ruby.LD.miss_latency_hist::total 16149 +system.ruby.ST.latency_hist::bucket_size 64 +system.ruby.ST.latency_hist::max_bucket 639 +system.ruby.ST.latency_hist::samples 10412 +system.ruby.ST.latency_hist::mean 5.551287 +system.ruby.ST.latency_hist::gmean 1.167783 +system.ruby.ST.latency_hist::stdev 26.172531 +system.ruby.ST.latency_hist | 10087 96.88% 96.88% | 0 0.00% 96.88% | 289 2.78% 99.65% | 29 0.28% 99.93% | 4 0.04% 99.97% | 2 0.02% 99.99% | 0 0.00% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.latency_hist::total 10412 +system.ruby.ST.hit_latency_hist::bucket_size 64 +system.ruby.ST.hit_latency_hist::max_bucket 639 +system.ruby.ST.hit_latency_hist::samples 325 +system.ruby.ST.hit_latency_hist::mean 146.809231 +system.ruby.ST.hit_latency_hist::gmean 143.903653 +system.ruby.ST.hit_latency_hist::stdev 36.751508 +system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 289 88.92% 88.92% | 29 8.92% 97.85% | 4 1.23% 99.08% | 2 0.62% 99.69% | 0 0.00% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.hit_latency_hist::total 325 +system.ruby.ST.miss_latency_hist::bucket_size 1 +system.ruby.ST.miss_latency_hist::max_bucket 9 +system.ruby.ST.miss_latency_hist::samples 10087 
+system.ruby.ST.miss_latency_hist::mean 1 +system.ruby.ST.miss_latency_hist::gmean 1 +system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10087 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.miss_latency_hist::total 10087 +system.ruby.IFETCH.latency_hist::bucket_size 64 +system.ruby.IFETCH.latency_hist::max_bucket 639 +system.ruby.IFETCH.latency_hist::samples 87095 +system.ruby.IFETCH.latency_hist::mean 2.818945 +system.ruby.IFETCH.latency_hist::gmean 1.063630 +system.ruby.IFETCH.latency_hist::stdev 17.067789 +system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 826 0.95% 99.76% | 185 0.21% 99.97% | 8 0.01% 99.98% | 9 0.01% 99.99% | 6 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.latency_hist::total 87095 +system.ruby.IFETCH.hit_latency_hist::bucket_size 64 +system.ruby.IFETCH.hit_latency_hist::max_bucket 639 +system.ruby.IFETCH.hit_latency_hist::samples 1034 +system.ruby.IFETCH.hit_latency_hist::mean 153.045455 +system.ruby.IFETCH.hit_latency_hist::gmean 149.192268 +system.ruby.IFETCH.hit_latency_hist::stdev 40.969954 +system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 826 79.88% 79.88% | 185 17.89% 97.78% | 8 0.77% 98.55% | 9 0.87% 99.42% | 6 0.58% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.hit_latency_hist::total 1034 +system.ruby.IFETCH.miss_latency_hist::bucket_size 2 +system.ruby.IFETCH.miss_latency_hist::max_bucket 19 +system.ruby.IFETCH.miss_latency_hist::samples 86061 +system.ruby.IFETCH.miss_latency_hist::mean 1.014013 +system.ruby.IFETCH.miss_latency_hist::gmean 1.002295 +system.ruby.IFETCH.miss_latency_hist::stdev 0.502042 +system.ruby.IFETCH.miss_latency_hist | 85994 99.92% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 
99.92% | 67 0.08% 100.00% +system.ruby.IFETCH.miss_latency_hist::total 86061 +system.ruby.RMW_Read.latency_hist::bucket_size 32 +system.ruby.RMW_Read.latency_hist::max_bucket 319 +system.ruby.RMW_Read.latency_hist::samples 341 +system.ruby.RMW_Read.latency_hist::mean 2.671554 +system.ruby.RMW_Read.latency_hist::gmean 1.059947 +system.ruby.RMW_Read.latency_hist::stdev 15.416875 +system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 3 0.88% 99.71% | 1 0.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.latency_hist::total 341 +system.ruby.RMW_Read.hit_latency_hist::bucket_size 32 +system.ruby.RMW_Read.hit_latency_hist::max_bucket 319 +system.ruby.RMW_Read.hit_latency_hist::samples 4 +system.ruby.RMW_Read.hit_latency_hist::mean 143.500000 +system.ruby.RMW_Read.hit_latency_hist::gmean 143.041358 +system.ruby.RMW_Read.hit_latency_hist::stdev 13.403980 +system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.hit_latency_hist::total 4 +system.ruby.RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.RMW_Read.miss_latency_hist::samples 337 +system.ruby.RMW_Read.miss_latency_hist::mean 1 +system.ruby.RMW_Read.miss_latency_hist::gmean 1 +system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.miss_latency_hist::total 337 +system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.latency_hist::samples 10 +system.ruby.Locked_RMW_Read.latency_hist::mean 1 
+system.ruby.Locked_RMW_Read.latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.latency_hist::total 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.miss_latency_hist::total 10 +system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.latency_hist::samples 10 +system.ruby.Locked_RMW_Write.latency_hist::mean 1 +system.ruby.Locked_RMW_Write.latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.latency_hist::total 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% 
+system.ruby.Locked_RMW_Write.miss_latency_hist::total 10 +system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1 +system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9 +system.ruby.L1Cache.miss_mach_latency_hist::samples 112580 +system.ruby.L1Cache.miss_mach_latency_hist::mean 1 +system.ruby.L1Cache.miss_mach_latency_hist::gmean 1 +system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112580 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L1Cache.miss_mach_latency_hist::total 112580 +system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2 +system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19 +system.ruby.L2Cache.miss_mach_latency_hist::samples 74 +system.ruby.L2Cache.miss_mach_latency_hist::mean 19 +system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000 +system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 74 100.00% 100.00% +system.ruby.L2Cache.miss_mach_latency_hist::total 74 +system.ruby.L3Cache.hit_mach_latency_hist::bucket_size 16 +system.ruby.L3Cache.hit_mach_latency_hist::max_bucket 159 +system.ruby.L3Cache.hit_mach_latency_hist::samples 11 +system.ruby.L3Cache.hit_mach_latency_hist::mean 107 +system.ruby.L3Cache.hit_mach_latency_hist::gmean 107.000000 +system.ruby.L3Cache.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 11 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L3Cache.hit_mach_latency_hist::total 11 +system.ruby.Directory.hit_mach_latency_hist::bucket_size 64 +system.ruby.Directory.hit_mach_latency_hist::max_bucket 639 +system.ruby.Directory.hit_mach_latency_hist::samples 1538 +system.ruby.Directory.hit_mach_latency_hist::mean 153.155397 
+system.ruby.Directory.hit_mach_latency_hist::gmean 149.362802 +system.ruby.Directory.hit_mach_latency_hist::stdev 40.587599 +system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 1238 80.49% 80.49% | 266 17.30% 97.79% | 14 0.91% 98.70% | 12 0.78% 99.48% | 7 0.46% 99.93% | 1 0.07% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Directory.hit_mach_latency_hist::total 1538 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16142 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16142 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16142 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 7 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 +system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 7 100.00% 100.00% +system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 7 +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::bucket_size 16 +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::max_bucket 159 +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::samples 11 +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::mean 107 +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::gmean 107.000000 
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 11 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.L3Cache.hit_type_mach_latency_hist::total 11 +system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175 +system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 165.811429 +system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 161.300002 +system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 42.776536 +system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 119 68.00% 68.00% | 52 29.71% 97.71% | 2 1.14% 98.86% | 1 0.57% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10087 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10087 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10087 +system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 325 +system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 146.809231 +system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 143.903653 
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 36.751508 +system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 289 88.92% 88.92% | 29 8.92% 97.85% | 4 1.23% 99.08% | 2 0.62% 99.69% | 0 0.00% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.Directory.hit_type_mach_latency_hist::total 325 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 85994 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 85994 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 85994 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 67 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000 +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 67 100.00% 100.00% +system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 67 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 153.045455 
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 149.192268 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 40.969954 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 826 79.88% 79.88% | 185 17.89% 97.78% | 8 0.77% 98.55% | 9 0.87% 99.42% | 6 0.58% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 143.500000 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 143.041358 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 13.403980 +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1 
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1 +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10 +system.ruby.SQC_Controller.Fetch 86 0.00% 0.00% +system.ruby.SQC_Controller.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00% +system.ruby.SQC_Controller.I.Data 5 0.00% 0.00% +system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00% +system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00% +system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00% +system.ruby.TCC_Controller.Atomic 2 0.00% 0.00% +system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00% +system.ruby.TCC_Controller.Data 9 0.00% 0.00% +system.ruby.TCC_Controller.PrbInv 11 0.00% 0.00% +system.ruby.TCC_Controller.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00% 
+system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00% +system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 0.00% +system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.I.PrbInv 10 0.00% 0.00% +system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00% +system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00% +system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00% +system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00% +system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00% +system.ruby.TCC_Controller.A.Data 2 0.00% 0.00% +system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00% +system.ruby.TCP_Controller.Load::total 10 +system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.StoreThrough::total 16 +system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.Atomic::total 2 +system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00% +system.ruby.TCP_Controller.Flush::total 1536 +system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00% +system.ruby.TCP_Controller.Evict::total 1024 +system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.TCC_Ack::total 6 +system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckWB::total 16 +system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.Load::total 4 +system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.StoreThrough::total 16 +system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.I.Atomic::total 2 +system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 100.00% +system.ruby.TCP_Controller.I.Flush::total 1532 +system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00% +system.ruby.TCP_Controller.I.Evict::total 1020 
+system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_Ack::total 4 +system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00% +system.ruby.TCP_Controller.I.TCC_AckWB::total 16 +system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00% +system.ruby.TCP_Controller.V.Load::total 6 +system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Flush::total 4 +system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00% +system.ruby.TCP_Controller.V.Evict::total 4 +system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.A.TCC_Ack::total 2 + +---------- End Simulation Statistics ---------- diff --git a/tests/quick/se/04.gpu/test.py b/tests/quick/se/04.gpu/test.py new file mode 100644 index 000000000..a074a8144 --- /dev/null +++ b/tests/quick/se/04.gpu/test.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Brad Beckmann +# +executable = binpath('gpu-hello') +kernel_path = os.path.dirname(executable) +kernel_files = glob.glob(os.path.join(kernel_path, '*.asm')) +if kernel_files: + print "Using GPU kernel code file(s)", ",".join(kernel_files) +else: + fatal("Can't locate kernel code (.asm) in " + kernel_path) + +driver = ClDriver(filename="hsa", codefile=kernel_files) +root.system.cpu[2].cl_driver = driver +root.system.cpu[0].workload = LiveProcess(cmd = 'gpu-hello', + executable = binpath('gpu-hello'), + drivers = [driver]) + diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini new file mode 100644 index 000000000..06da5f023 --- /dev/null +++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini @@ -0,0 +1,5862 @@ +[root] +type=Root +children=system +eventq_index=0 +full_system=false +sim_quantum=0 +time_sync_enable=false +time_sync_period=100000000 +time_sync_spin_threshold=100000 + +[system] +type=System +children=clk_domain cp_cntrl0 cpu dir_cntrl0 dvfs_handler mem_ctrls ruby sqc_cntrl0 sqc_cntrl1 
sys_port_proxy tcc_cntrl0 tccdir_cntrl0 tcp_cntrl0 tcp_cntrl1 tcp_cntrl2 tcp_cntrl3 tcp_cntrl4 tcp_cntrl5 tcp_cntrl6 tcp_cntrl7 voltage_domain +boot_osflags=a +cache_line_size=64 +clk_domain=system.clk_domain +eventq_index=0 +exit_on_work_items=false +init_param=0 +kernel= +kernel_addr_check=true +load_addr_mask=1099511627775 +load_offset=0 +mem_mode=timing +mem_ranges=0:268435455 +memories=system.mem_ctrls +mmap_using_noreserve=false +multi_thread=false +num_work_ids=16 +readfile= +symbolfile= +work_begin_ckpt_count=0 +work_begin_cpu_id_exit=-1 +work_begin_exit_count=0 +work_cpus_ckpt_count=0 +work_end_ckpt_count=0 +work_end_exit_count=0 +work_item_id=-1 +system_port=system.sys_port_proxy.slave[0] + +[system.clk_domain] +type=SrcClockDomain +clock=1 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.cp_cntrl0] +type=CorePair_Controller +children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore +L1D0cache=system.cp_cntrl0.L1D0cache +L1D1cache=system.cp_cntrl0.L1D1cache +L1Icache=system.cp_cntrl0.L1Icache +L2cache=system.cp_cntrl0.L2cache +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=15 +l2_hit_latency=18 +mandatoryQueue=system.cp_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToCore=system.cp_cntrl0.probeToCore +recycle_latency=10 +requestFromCore=system.cp_cntrl0.requestFromCore +responseFromCore=system.cp_cntrl0.responseFromCore +responseToCore=system.cp_cntrl0.responseToCore +ruby_system=system.ruby +send_evictions=true +sequencer=system.cp_cntrl0.sequencer +sequencer1=system.cp_cntrl0.sequencer1 +system=system +transitions_per_cycle=32 +triggerQueue=system.cp_cntrl0.triggerQueue +unblockFromCore=system.cp_cntrl0.unblockFromCore +version=0 + +[system.cp_cntrl0.L1D0cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 
+dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=256 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1D0cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=256 + +[system.cp_cntrl0.L1D1cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=256 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1D1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=256 + +[system.cp_cntrl0.L1Icache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=256 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L1Icache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=256 + +[system.cp_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=2 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.cp_cntrl0.L2cache.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=512 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.cp_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=2 +block_size=64 +eventq_index=0 +size=512 + +[system.cp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.cp_cntrl0.probeToCore] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[3] + +[system.cp_cntrl0.requestFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[2] + +[system.cp_cntrl0.responseFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[3] + +[system.cp_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[4] + +[system.cp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=0 +dcache=system.cp_cntrl0.L1D0cache +dcache_hit_latency=2 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=2 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=true +version=0 +slave=system.cpu.cpuInstDataPort[0] + +[system.cp_cntrl0.sequencer1] +type=RubySequencer +clk_domain=system.clk_domain +coreid=1 +dcache=system.cp_cntrl0.L1D1cache +dcache_hit_latency=2 +deadlock_threshold=500000 +eventq_index=0 +icache=system.cp_cntrl0.L1Icache +icache_hit_latency=2 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=true +version=1 +slave=system.cpu.cpuInstDataPort[1] + +[system.cp_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.cp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[4] + +[system.cpu] +type=RubyTester +check_flush=false +checks_to_complete=100 
+clk_domain=system.clk_domain +deadlock_threshold=50000 +eventq_index=0 +num_cpus=12 +system=system +wakeup_frequency=10 +cpuDataPort=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl2.coalescer.slave[0] system.tcp_cntrl3.coalescer.slave[0] system.tcp_cntrl4.coalescer.slave[0] system.tcp_cntrl5.coalescer.slave[0] system.tcp_cntrl6.coalescer.slave[0] system.tcp_cntrl7.coalescer.slave[0] +cpuInstDataPort=system.cp_cntrl0.sequencer.slave[0] system.cp_cntrl0.sequencer1.slave[0] +cpuInstPort=system.sqc_cntrl0.sequencer.slave[0] system.sqc_cntrl1.sequencer.slave[0] + +[system.dir_cntrl0] +type=Directory_Controller +children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores +CPUonly=false +L3CacheMemory=system.dir_cntrl0.L3CacheMemory +L3triggerQueue=system.dir_cntrl0.L3triggerQueue +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +directory=system.dir_cntrl0.directory +eventq_index=0 +l3_hit_latency=15 +noTCCdir=false +number_of_TBEs=20480 +probeToCore=system.dir_cntrl0.probeToCore +recycle_latency=10 +requestFromCores=system.dir_cntrl0.requestFromCores +responseFromCores=system.dir_cntrl0.responseFromCores +responseFromMemory=system.dir_cntrl0.responseFromMemory +responseToCore=system.dir_cntrl0.responseToCore +response_latency=30 +ruby_system=system.ruby +system=system +to_memory_controller_latency=1 +transitions_per_cycle=32 +triggerQueue=system.dir_cntrl0.triggerQueue +unblockFromCores=system.dir_cntrl0.unblockFromCores +useL3OnWT=false +version=0 +memory=system.mem_ctrls.port + +[system.dir_cntrl0.L3CacheMemory] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=20 +dataArrayBanks=256.0 +eventq_index=0 +is_icache=false +replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=1024 
+start_index_bit=6 +tagAccessLatency=15 +tagArrayBanks=256.0 + +[system.dir_cntrl0.L3CacheMemory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=1024 + +[system.dir_cntrl0.L3triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.directory] +type=RubyDirectoryMemory +eventq_index=0 +numa_high_bit=5 +size=536870912 +version=0 + +[system.dir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[0] + +[system.dir_cntrl0.requestFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[0] + +[system.dir_cntrl0.responseFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[1] + +[system.dir_cntrl0.responseFromMemory] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.dir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[1] + +[system.dir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.dir_cntrl0.unblockFromCores] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[2] + +[system.dvfs_handler] +type=DVFSHandler +domains= +enable=false +eventq_index=0 +sys_clk_domain=system.clk_domain +transition_latency=100000 + +[system.mem_ctrls] +type=DRAMCtrl +IDD0=0.075000 +IDD02=0.000000 +IDD2N=0.050000 +IDD2N2=0.000000 +IDD2P0=0.000000 +IDD2P02=0.000000 +IDD2P1=0.000000 +IDD2P12=0.000000 +IDD3N=0.057000 +IDD3N2=0.000000 +IDD3P0=0.000000 +IDD3P02=0.000000 +IDD3P1=0.000000 +IDD3P12=0.000000 +IDD4R=0.187000 +IDD4R2=0.000000 +IDD4W=0.165000 +IDD4W2=0.000000 
+IDD5=0.220000 +IDD52=0.000000 +IDD6=0.000000 +IDD62=0.000000 +VDD=1.500000 +VDD2=0.000000 +activation_limit=4 +addr_mapping=RoRaBaCoCh +bank_groups_per_rank=0 +banks_per_rank=8 +burst_length=8 +channels=1 +clk_domain=system.clk_domain +conf_table_reported=true +device_bus_width=8 +device_rowbuffer_size=1024 +device_size=536870912 +devices_per_rank=8 +dll=true +eventq_index=0 +in_addr_map=true +max_accesses_per_row=16 +mem_sched_policy=frfcfs +min_writes_per_switch=16 +null=false +page_policy=open_adaptive +range=0:268435455 +ranks_per_channel=2 +read_buffer_size=32 +static_backend_latency=10 +static_frontend_latency=10 +tBURST=5 +tCCD_L=0 +tCK=1 +tCL=14 +tCS=3 +tRAS=35 +tRCD=14 +tREFI=7800 +tRFC=260 +tRP=14 +tRRD=6 +tRRD_L=0 +tRTP=8 +tRTW=3 +tWR=15 +tWTR=8 +tXAW=30 +tXP=0 +tXPDLL=0 +tXS=0 +tXSDLL=0 +write_buffer_size=64 +write_high_thresh_perc=85 +write_low_thresh_perc=50 +port=system.dir_cntrl0.memory + +[system.ruby] +type=RubySystem +children=clk_domain network +access_backing_store=false +all_instructions=false +block_size_bytes=64 +clk_domain=system.ruby.clk_domain +eventq_index=0 +hot_lines=false +memory_size_bits=48 +num_of_sequencers=12 +number_of_virtual_networks=10 +phys_mem=Null +randomization=true + +[system.ruby.clk_domain] +type=SrcClockDomain +clock=1 +domain_id=-1 +eventq_index=0 +init_perf_level=0 +voltage_domain=system.voltage_domain + +[system.ruby.network] +type=SimpleNetwork +children=ext_links00 ext_links01 ext_links02 ext_links03 ext_links04 ext_links05 ext_links06 ext_links07 ext_links08 ext_links09 ext_links10 ext_links11 ext_links12 ext_links13 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 
int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1 +adaptive_routing=false +buffer_size=0 +clk_domain=system.ruby.clk_domain +control_msg_size=8 +endpoint_bandwidth=1000 +eventq_index=0 +ext_links=system.ruby.network.ext_links00 system.ruby.network.ext_links01 system.ruby.network.ext_links02 system.ruby.network.ext_links03 system.ruby.network.ext_links04 system.ruby.network.ext_links05 system.ruby.network.ext_links06 system.ruby.network.ext_links07 system.ruby.network.ext_links08 system.ruby.network.ext_links09 system.ruby.network.ext_links10 system.ruby.network.ext_links11 system.ruby.network.ext_links12 system.ruby.network.ext_links13 +int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 
system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39 +int_links=system.ruby.network.int_links0 system.ruby.network.int_links1 +netifs= +number_of_virtual_networks=10 +routers=system.ruby.network.ext_links00.int_node system.ruby.network.ext_links01.int_node system.ruby.network.ext_links02.int_node +ruby_system=system.ruby +topology=Crossbar +master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.tcp_cntrl2.probeToTCP.slave system.tcp_cntrl2.responseToTCP.slave system.tcp_cntrl3.probeToTCP.slave system.tcp_cntrl3.responseToTCP.slave system.tcp_cntrl4.probeToTCP.slave system.tcp_cntrl4.responseToTCP.slave system.tcp_cntrl5.probeToTCP.slave system.tcp_cntrl5.responseToTCP.slave system.tcp_cntrl6.probeToTCP.slave system.tcp_cntrl6.responseToTCP.slave system.tcp_cntrl7.probeToTCP.slave system.tcp_cntrl7.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.sqc_cntrl1.probeToSQC.slave system.sqc_cntrl1.responseToSQC.slave system.tcc_cntrl0.responseToTCC.slave system.tccdir_cntrl0.requestFromTCP.slave system.tccdir_cntrl0.responseFromTCP.slave system.tccdir_cntrl0.unblockFromTCP.slave 
system.tccdir_cntrl0.probeFromNB.slave system.tccdir_cntrl0.responseFromNB.slave +slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.tcp_cntrl2.requestFromTCP.master system.tcp_cntrl2.responseFromTCP.master system.tcp_cntrl2.unblockFromCore.master system.tcp_cntrl3.requestFromTCP.master system.tcp_cntrl3.responseFromTCP.master system.tcp_cntrl3.unblockFromCore.master system.tcp_cntrl4.requestFromTCP.master system.tcp_cntrl4.responseFromTCP.master system.tcp_cntrl4.unblockFromCore.master system.tcp_cntrl5.requestFromTCP.master system.tcp_cntrl5.responseFromTCP.master system.tcp_cntrl5.unblockFromCore.master system.tcp_cntrl6.requestFromTCP.master system.tcp_cntrl6.responseFromTCP.master system.tcp_cntrl6.unblockFromCore.master system.tcp_cntrl7.requestFromTCP.master system.tcp_cntrl7.responseFromTCP.master system.tcp_cntrl7.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.sqc_cntrl0.responseFromSQC.master system.sqc_cntrl0.unblockFromCore.master system.sqc_cntrl1.requestFromSQC.master system.sqc_cntrl1.responseFromSQC.master system.sqc_cntrl1.unblockFromCore.master system.tcc_cntrl0.responseFromTCC.master system.tccdir_cntrl0.probeToCore.master system.tccdir_cntrl0.responseToCore.master system.tccdir_cntrl0.requestToNB.master system.tccdir_cntrl0.responseToNB.master system.tccdir_cntrl0.unblockToNB.master + +[system.ruby.network.ext_links00] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 +ext_node=system.dir_cntrl0 +int_node=system.ruby.network.ext_links00.int_node +latency=1 +link_id=0 +weight=1 + 
+[system.ruby.network.ext_links00.int_node] +type=Switch +children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 
port_buffers120 port_buffers121 port_buffers122 port_buffers123 port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149 port_buffers150 port_buffers151 port_buffers152 port_buffers153 port_buffers154 port_buffers155 port_buffers156 port_buffers157 port_buffers158 port_buffers159 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links00.int_node.port_buffers000 system.ruby.network.ext_links00.int_node.port_buffers001 system.ruby.network.ext_links00.int_node.port_buffers002 system.ruby.network.ext_links00.int_node.port_buffers003 system.ruby.network.ext_links00.int_node.port_buffers004 system.ruby.network.ext_links00.int_node.port_buffers005 system.ruby.network.ext_links00.int_node.port_buffers006 system.ruby.network.ext_links00.int_node.port_buffers007 system.ruby.network.ext_links00.int_node.port_buffers008 system.ruby.network.ext_links00.int_node.port_buffers009 system.ruby.network.ext_links00.int_node.port_buffers010 system.ruby.network.ext_links00.int_node.port_buffers011 system.ruby.network.ext_links00.int_node.port_buffers012 system.ruby.network.ext_links00.int_node.port_buffers013 system.ruby.network.ext_links00.int_node.port_buffers014 system.ruby.network.ext_links00.int_node.port_buffers015 system.ruby.network.ext_links00.int_node.port_buffers016 system.ruby.network.ext_links00.int_node.port_buffers017 system.ruby.network.ext_links00.int_node.port_buffers018 system.ruby.network.ext_links00.int_node.port_buffers019 system.ruby.network.ext_links00.int_node.port_buffers020 system.ruby.network.ext_links00.int_node.port_buffers021 
system.ruby.network.ext_links00.int_node.port_buffers022 system.ruby.network.ext_links00.int_node.port_buffers023 system.ruby.network.ext_links00.int_node.port_buffers024 system.ruby.network.ext_links00.int_node.port_buffers025 system.ruby.network.ext_links00.int_node.port_buffers026 system.ruby.network.ext_links00.int_node.port_buffers027 system.ruby.network.ext_links00.int_node.port_buffers028 system.ruby.network.ext_links00.int_node.port_buffers029 system.ruby.network.ext_links00.int_node.port_buffers030 system.ruby.network.ext_links00.int_node.port_buffers031 system.ruby.network.ext_links00.int_node.port_buffers032 system.ruby.network.ext_links00.int_node.port_buffers033 system.ruby.network.ext_links00.int_node.port_buffers034 system.ruby.network.ext_links00.int_node.port_buffers035 system.ruby.network.ext_links00.int_node.port_buffers036 system.ruby.network.ext_links00.int_node.port_buffers037 system.ruby.network.ext_links00.int_node.port_buffers038 system.ruby.network.ext_links00.int_node.port_buffers039 system.ruby.network.ext_links00.int_node.port_buffers040 system.ruby.network.ext_links00.int_node.port_buffers041 system.ruby.network.ext_links00.int_node.port_buffers042 system.ruby.network.ext_links00.int_node.port_buffers043 system.ruby.network.ext_links00.int_node.port_buffers044 system.ruby.network.ext_links00.int_node.port_buffers045 system.ruby.network.ext_links00.int_node.port_buffers046 system.ruby.network.ext_links00.int_node.port_buffers047 system.ruby.network.ext_links00.int_node.port_buffers048 system.ruby.network.ext_links00.int_node.port_buffers049 system.ruby.network.ext_links00.int_node.port_buffers050 system.ruby.network.ext_links00.int_node.port_buffers051 system.ruby.network.ext_links00.int_node.port_buffers052 system.ruby.network.ext_links00.int_node.port_buffers053 system.ruby.network.ext_links00.int_node.port_buffers054 system.ruby.network.ext_links00.int_node.port_buffers055 system.ruby.network.ext_links00.int_node.port_buffers056 
system.ruby.network.ext_links00.int_node.port_buffers057 system.ruby.network.ext_links00.int_node.port_buffers058 system.ruby.network.ext_links00.int_node.port_buffers059 system.ruby.network.ext_links00.int_node.port_buffers060 system.ruby.network.ext_links00.int_node.port_buffers061 system.ruby.network.ext_links00.int_node.port_buffers062 system.ruby.network.ext_links00.int_node.port_buffers063 system.ruby.network.ext_links00.int_node.port_buffers064 system.ruby.network.ext_links00.int_node.port_buffers065 system.ruby.network.ext_links00.int_node.port_buffers066 system.ruby.network.ext_links00.int_node.port_buffers067 system.ruby.network.ext_links00.int_node.port_buffers068 system.ruby.network.ext_links00.int_node.port_buffers069 system.ruby.network.ext_links00.int_node.port_buffers070 system.ruby.network.ext_links00.int_node.port_buffers071 system.ruby.network.ext_links00.int_node.port_buffers072 system.ruby.network.ext_links00.int_node.port_buffers073 system.ruby.network.ext_links00.int_node.port_buffers074 system.ruby.network.ext_links00.int_node.port_buffers075 system.ruby.network.ext_links00.int_node.port_buffers076 system.ruby.network.ext_links00.int_node.port_buffers077 system.ruby.network.ext_links00.int_node.port_buffers078 system.ruby.network.ext_links00.int_node.port_buffers079 system.ruby.network.ext_links00.int_node.port_buffers080 system.ruby.network.ext_links00.int_node.port_buffers081 system.ruby.network.ext_links00.int_node.port_buffers082 system.ruby.network.ext_links00.int_node.port_buffers083 system.ruby.network.ext_links00.int_node.port_buffers084 system.ruby.network.ext_links00.int_node.port_buffers085 system.ruby.network.ext_links00.int_node.port_buffers086 system.ruby.network.ext_links00.int_node.port_buffers087 system.ruby.network.ext_links00.int_node.port_buffers088 system.ruby.network.ext_links00.int_node.port_buffers089 system.ruby.network.ext_links00.int_node.port_buffers090 system.ruby.network.ext_links00.int_node.port_buffers091 
system.ruby.network.ext_links00.int_node.port_buffers092 system.ruby.network.ext_links00.int_node.port_buffers093 system.ruby.network.ext_links00.int_node.port_buffers094 system.ruby.network.ext_links00.int_node.port_buffers095 system.ruby.network.ext_links00.int_node.port_buffers096 system.ruby.network.ext_links00.int_node.port_buffers097 system.ruby.network.ext_links00.int_node.port_buffers098 system.ruby.network.ext_links00.int_node.port_buffers099 system.ruby.network.ext_links00.int_node.port_buffers100 system.ruby.network.ext_links00.int_node.port_buffers101 system.ruby.network.ext_links00.int_node.port_buffers102 system.ruby.network.ext_links00.int_node.port_buffers103 system.ruby.network.ext_links00.int_node.port_buffers104 system.ruby.network.ext_links00.int_node.port_buffers105 system.ruby.network.ext_links00.int_node.port_buffers106 system.ruby.network.ext_links00.int_node.port_buffers107 system.ruby.network.ext_links00.int_node.port_buffers108 system.ruby.network.ext_links00.int_node.port_buffers109 system.ruby.network.ext_links00.int_node.port_buffers110 system.ruby.network.ext_links00.int_node.port_buffers111 system.ruby.network.ext_links00.int_node.port_buffers112 system.ruby.network.ext_links00.int_node.port_buffers113 system.ruby.network.ext_links00.int_node.port_buffers114 system.ruby.network.ext_links00.int_node.port_buffers115 system.ruby.network.ext_links00.int_node.port_buffers116 system.ruby.network.ext_links00.int_node.port_buffers117 system.ruby.network.ext_links00.int_node.port_buffers118 system.ruby.network.ext_links00.int_node.port_buffers119 system.ruby.network.ext_links00.int_node.port_buffers120 system.ruby.network.ext_links00.int_node.port_buffers121 system.ruby.network.ext_links00.int_node.port_buffers122 system.ruby.network.ext_links00.int_node.port_buffers123 system.ruby.network.ext_links00.int_node.port_buffers124 system.ruby.network.ext_links00.int_node.port_buffers125 system.ruby.network.ext_links00.int_node.port_buffers126 
system.ruby.network.ext_links00.int_node.port_buffers127 system.ruby.network.ext_links00.int_node.port_buffers128 system.ruby.network.ext_links00.int_node.port_buffers129 system.ruby.network.ext_links00.int_node.port_buffers130 system.ruby.network.ext_links00.int_node.port_buffers131 system.ruby.network.ext_links00.int_node.port_buffers132 system.ruby.network.ext_links00.int_node.port_buffers133 system.ruby.network.ext_links00.int_node.port_buffers134 system.ruby.network.ext_links00.int_node.port_buffers135 system.ruby.network.ext_links00.int_node.port_buffers136 system.ruby.network.ext_links00.int_node.port_buffers137 system.ruby.network.ext_links00.int_node.port_buffers138 system.ruby.network.ext_links00.int_node.port_buffers139 system.ruby.network.ext_links00.int_node.port_buffers140 system.ruby.network.ext_links00.int_node.port_buffers141 system.ruby.network.ext_links00.int_node.port_buffers142 system.ruby.network.ext_links00.int_node.port_buffers143 system.ruby.network.ext_links00.int_node.port_buffers144 system.ruby.network.ext_links00.int_node.port_buffers145 system.ruby.network.ext_links00.int_node.port_buffers146 system.ruby.network.ext_links00.int_node.port_buffers147 system.ruby.network.ext_links00.int_node.port_buffers148 system.ruby.network.ext_links00.int_node.port_buffers149 system.ruby.network.ext_links00.int_node.port_buffers150 system.ruby.network.ext_links00.int_node.port_buffers151 system.ruby.network.ext_links00.int_node.port_buffers152 system.ruby.network.ext_links00.int_node.port_buffers153 system.ruby.network.ext_links00.int_node.port_buffers154 system.ruby.network.ext_links00.int_node.port_buffers155 system.ruby.network.ext_links00.int_node.port_buffers156 system.ruby.network.ext_links00.int_node.port_buffers157 system.ruby.network.ext_links00.int_node.port_buffers158 system.ruby.network.ext_links00.int_node.port_buffers159 +router_id=0 +virt_nets=10 + +[system.ruby.network.ext_links00.int_node.port_buffers000] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers001] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers002] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers003] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers004] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers005] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers006] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers007] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers008] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers009] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers010] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers011] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers012] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers013] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links00.int_node.port_buffers014] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers015] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers016] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers017] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers018] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers019] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers020] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers021] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers022] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers023] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers024] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers025] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers026] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers027] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers028] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers029] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers030] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers031] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers032] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers033] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers034] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers035] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers036] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers037] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers038] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers039] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers040] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers041] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers042] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers043] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers044] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers045] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers046] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers047] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers048] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers049] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers050] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers051] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers052] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers053] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links00.int_node.port_buffers054] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers055] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers056] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers057] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers058] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers059] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers060] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers061] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers062] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers063] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers064] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers065] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers066] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers067] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers068] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers069] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers070] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers071] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers072] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers073] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers074] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers075] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers076] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers077] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers078] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers079] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers080] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers081] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers082] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers083] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers084] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers085] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers086] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers087] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers088] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers089] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers090] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers091] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers092] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers093] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links00.int_node.port_buffers094] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers095] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers096] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers097] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers098] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers099] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers100] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers101] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers102] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers103] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers104] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers105] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers106] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers107] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers108] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers109] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers110] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers111] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers112] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers113] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers114] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers115] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers116] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers117] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers118] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers119] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers120] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers121] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers122] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers123] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers124] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers125] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers126] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers127] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers128] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers129] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers130] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers131] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers132] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers133] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links00.int_node.port_buffers134] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers135] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers136] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers137] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers138] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers139] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers140] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers141] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers142] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers143] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers144] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers145] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers146] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers147] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers148] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers149] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers150] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers151] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers152] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers153] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers154] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers155] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers156] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers157] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers158] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links00.int_node.port_buffers159] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 
+ext_node=system.cp_cntrl0 +int_node=system.ruby.network.ext_links01.int_node +latency=1 +link_id=1 +weight=1 + +[system.ruby.network.ext_links01.int_node] +type=Switch +children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 
port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 port_buffers120 port_buffers121 port_buffers122 port_buffers123 port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links01.int_node.port_buffers000 system.ruby.network.ext_links01.int_node.port_buffers001 system.ruby.network.ext_links01.int_node.port_buffers002 system.ruby.network.ext_links01.int_node.port_buffers003 system.ruby.network.ext_links01.int_node.port_buffers004 system.ruby.network.ext_links01.int_node.port_buffers005 system.ruby.network.ext_links01.int_node.port_buffers006 system.ruby.network.ext_links01.int_node.port_buffers007 system.ruby.network.ext_links01.int_node.port_buffers008 system.ruby.network.ext_links01.int_node.port_buffers009 system.ruby.network.ext_links01.int_node.port_buffers010 system.ruby.network.ext_links01.int_node.port_buffers011 system.ruby.network.ext_links01.int_node.port_buffers012 system.ruby.network.ext_links01.int_node.port_buffers013 system.ruby.network.ext_links01.int_node.port_buffers014 system.ruby.network.ext_links01.int_node.port_buffers015 system.ruby.network.ext_links01.int_node.port_buffers016 system.ruby.network.ext_links01.int_node.port_buffers017 system.ruby.network.ext_links01.int_node.port_buffers018 system.ruby.network.ext_links01.int_node.port_buffers019 system.ruby.network.ext_links01.int_node.port_buffers020 system.ruby.network.ext_links01.int_node.port_buffers021 system.ruby.network.ext_links01.int_node.port_buffers022 
system.ruby.network.ext_links01.int_node.port_buffers023 system.ruby.network.ext_links01.int_node.port_buffers024 system.ruby.network.ext_links01.int_node.port_buffers025 system.ruby.network.ext_links01.int_node.port_buffers026 system.ruby.network.ext_links01.int_node.port_buffers027 system.ruby.network.ext_links01.int_node.port_buffers028 system.ruby.network.ext_links01.int_node.port_buffers029 system.ruby.network.ext_links01.int_node.port_buffers030 system.ruby.network.ext_links01.int_node.port_buffers031 system.ruby.network.ext_links01.int_node.port_buffers032 system.ruby.network.ext_links01.int_node.port_buffers033 system.ruby.network.ext_links01.int_node.port_buffers034 system.ruby.network.ext_links01.int_node.port_buffers035 system.ruby.network.ext_links01.int_node.port_buffers036 system.ruby.network.ext_links01.int_node.port_buffers037 system.ruby.network.ext_links01.int_node.port_buffers038 system.ruby.network.ext_links01.int_node.port_buffers039 system.ruby.network.ext_links01.int_node.port_buffers040 system.ruby.network.ext_links01.int_node.port_buffers041 system.ruby.network.ext_links01.int_node.port_buffers042 system.ruby.network.ext_links01.int_node.port_buffers043 system.ruby.network.ext_links01.int_node.port_buffers044 system.ruby.network.ext_links01.int_node.port_buffers045 system.ruby.network.ext_links01.int_node.port_buffers046 system.ruby.network.ext_links01.int_node.port_buffers047 system.ruby.network.ext_links01.int_node.port_buffers048 system.ruby.network.ext_links01.int_node.port_buffers049 system.ruby.network.ext_links01.int_node.port_buffers050 system.ruby.network.ext_links01.int_node.port_buffers051 system.ruby.network.ext_links01.int_node.port_buffers052 system.ruby.network.ext_links01.int_node.port_buffers053 system.ruby.network.ext_links01.int_node.port_buffers054 system.ruby.network.ext_links01.int_node.port_buffers055 system.ruby.network.ext_links01.int_node.port_buffers056 system.ruby.network.ext_links01.int_node.port_buffers057 
system.ruby.network.ext_links01.int_node.port_buffers058 system.ruby.network.ext_links01.int_node.port_buffers059 system.ruby.network.ext_links01.int_node.port_buffers060 system.ruby.network.ext_links01.int_node.port_buffers061 system.ruby.network.ext_links01.int_node.port_buffers062 system.ruby.network.ext_links01.int_node.port_buffers063 system.ruby.network.ext_links01.int_node.port_buffers064 system.ruby.network.ext_links01.int_node.port_buffers065 system.ruby.network.ext_links01.int_node.port_buffers066 system.ruby.network.ext_links01.int_node.port_buffers067 system.ruby.network.ext_links01.int_node.port_buffers068 system.ruby.network.ext_links01.int_node.port_buffers069 system.ruby.network.ext_links01.int_node.port_buffers070 system.ruby.network.ext_links01.int_node.port_buffers071 system.ruby.network.ext_links01.int_node.port_buffers072 system.ruby.network.ext_links01.int_node.port_buffers073 system.ruby.network.ext_links01.int_node.port_buffers074 system.ruby.network.ext_links01.int_node.port_buffers075 system.ruby.network.ext_links01.int_node.port_buffers076 system.ruby.network.ext_links01.int_node.port_buffers077 system.ruby.network.ext_links01.int_node.port_buffers078 system.ruby.network.ext_links01.int_node.port_buffers079 system.ruby.network.ext_links01.int_node.port_buffers080 system.ruby.network.ext_links01.int_node.port_buffers081 system.ruby.network.ext_links01.int_node.port_buffers082 system.ruby.network.ext_links01.int_node.port_buffers083 system.ruby.network.ext_links01.int_node.port_buffers084 system.ruby.network.ext_links01.int_node.port_buffers085 system.ruby.network.ext_links01.int_node.port_buffers086 system.ruby.network.ext_links01.int_node.port_buffers087 system.ruby.network.ext_links01.int_node.port_buffers088 system.ruby.network.ext_links01.int_node.port_buffers089 system.ruby.network.ext_links01.int_node.port_buffers090 system.ruby.network.ext_links01.int_node.port_buffers091 system.ruby.network.ext_links01.int_node.port_buffers092 
system.ruby.network.ext_links01.int_node.port_buffers093 system.ruby.network.ext_links01.int_node.port_buffers094 system.ruby.network.ext_links01.int_node.port_buffers095 system.ruby.network.ext_links01.int_node.port_buffers096 system.ruby.network.ext_links01.int_node.port_buffers097 system.ruby.network.ext_links01.int_node.port_buffers098 system.ruby.network.ext_links01.int_node.port_buffers099 system.ruby.network.ext_links01.int_node.port_buffers100 system.ruby.network.ext_links01.int_node.port_buffers101 system.ruby.network.ext_links01.int_node.port_buffers102 system.ruby.network.ext_links01.int_node.port_buffers103 system.ruby.network.ext_links01.int_node.port_buffers104 system.ruby.network.ext_links01.int_node.port_buffers105 system.ruby.network.ext_links01.int_node.port_buffers106 system.ruby.network.ext_links01.int_node.port_buffers107 system.ruby.network.ext_links01.int_node.port_buffers108 system.ruby.network.ext_links01.int_node.port_buffers109 system.ruby.network.ext_links01.int_node.port_buffers110 system.ruby.network.ext_links01.int_node.port_buffers111 system.ruby.network.ext_links01.int_node.port_buffers112 system.ruby.network.ext_links01.int_node.port_buffers113 system.ruby.network.ext_links01.int_node.port_buffers114 system.ruby.network.ext_links01.int_node.port_buffers115 system.ruby.network.ext_links01.int_node.port_buffers116 system.ruby.network.ext_links01.int_node.port_buffers117 system.ruby.network.ext_links01.int_node.port_buffers118 system.ruby.network.ext_links01.int_node.port_buffers119 system.ruby.network.ext_links01.int_node.port_buffers120 system.ruby.network.ext_links01.int_node.port_buffers121 system.ruby.network.ext_links01.int_node.port_buffers122 system.ruby.network.ext_links01.int_node.port_buffers123 system.ruby.network.ext_links01.int_node.port_buffers124 system.ruby.network.ext_links01.int_node.port_buffers125 system.ruby.network.ext_links01.int_node.port_buffers126 system.ruby.network.ext_links01.int_node.port_buffers127 
system.ruby.network.ext_links01.int_node.port_buffers128 system.ruby.network.ext_links01.int_node.port_buffers129 system.ruby.network.ext_links01.int_node.port_buffers130 system.ruby.network.ext_links01.int_node.port_buffers131 system.ruby.network.ext_links01.int_node.port_buffers132 system.ruby.network.ext_links01.int_node.port_buffers133 system.ruby.network.ext_links01.int_node.port_buffers134 system.ruby.network.ext_links01.int_node.port_buffers135 system.ruby.network.ext_links01.int_node.port_buffers136 system.ruby.network.ext_links01.int_node.port_buffers137 system.ruby.network.ext_links01.int_node.port_buffers138 system.ruby.network.ext_links01.int_node.port_buffers139 system.ruby.network.ext_links01.int_node.port_buffers140 system.ruby.network.ext_links01.int_node.port_buffers141 system.ruby.network.ext_links01.int_node.port_buffers142 system.ruby.network.ext_links01.int_node.port_buffers143 system.ruby.network.ext_links01.int_node.port_buffers144 system.ruby.network.ext_links01.int_node.port_buffers145 system.ruby.network.ext_links01.int_node.port_buffers146 system.ruby.network.ext_links01.int_node.port_buffers147 system.ruby.network.ext_links01.int_node.port_buffers148 system.ruby.network.ext_links01.int_node.port_buffers149 +router_id=1 +virt_nets=10 + +[system.ruby.network.ext_links01.int_node.port_buffers000] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers001] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers002] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers003] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers004] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers005] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers006] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers007] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers008] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers009] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers010] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers011] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers012] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers013] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers014] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers015] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers016] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers017] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links01.int_node.port_buffers018] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers019] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers020] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers021] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers022] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers023] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers024] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers025] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers026] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers027] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers028] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers029] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers030] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers031] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers032] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers033] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers034] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers035] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers036] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers037] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers038] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers039] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers040] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers041] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers042] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers043] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers044] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers045] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers046] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers047] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers048] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers049] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers050] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers051] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers052] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers053] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers054] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers055] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers056] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers057] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links01.int_node.port_buffers058] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers059] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers060] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers061] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers062] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers063] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers064] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers065] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers066] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers067] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers068] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers069] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers070] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers071] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers072] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers073] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers074] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers075] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers076] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers077] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers078] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers079] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers080] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers081] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers082] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers083] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers084] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers085] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers086] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers087] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers088] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers089] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers090] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers091] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers092] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers093] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers094] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers095] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers096] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers097] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links01.int_node.port_buffers098] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers099] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers100] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers101] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers102] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers103] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers104] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers105] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers106] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers107] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers108] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers109] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers110] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers111] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers112] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers113] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers114] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers115] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers116] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers117] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers118] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers119] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers120] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers121] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers122] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers123] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers124] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers125] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers126] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers127] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers128] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers129] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers130] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers131] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers132] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers133] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers134] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers135] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers136] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers137] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links01.int_node.port_buffers138] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers139] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers140] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers141] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers142] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers143] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers144] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers145] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers146] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers147] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers148] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links01.int_node.port_buffers149] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02] +type=SimpleExtLink +children=int_node +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl0 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=2 +weight=1 
+ +[system.ruby.network.ext_links02.int_node] +type=Switch +children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 
port_buffers120 port_buffers121 port_buffers122 port_buffers123 port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149 +clk_domain=system.ruby.clk_domain +eventq_index=0 +port_buffers=system.ruby.network.ext_links02.int_node.port_buffers000 system.ruby.network.ext_links02.int_node.port_buffers001 system.ruby.network.ext_links02.int_node.port_buffers002 system.ruby.network.ext_links02.int_node.port_buffers003 system.ruby.network.ext_links02.int_node.port_buffers004 system.ruby.network.ext_links02.int_node.port_buffers005 system.ruby.network.ext_links02.int_node.port_buffers006 system.ruby.network.ext_links02.int_node.port_buffers007 system.ruby.network.ext_links02.int_node.port_buffers008 system.ruby.network.ext_links02.int_node.port_buffers009 system.ruby.network.ext_links02.int_node.port_buffers010 system.ruby.network.ext_links02.int_node.port_buffers011 system.ruby.network.ext_links02.int_node.port_buffers012 system.ruby.network.ext_links02.int_node.port_buffers013 system.ruby.network.ext_links02.int_node.port_buffers014 system.ruby.network.ext_links02.int_node.port_buffers015 system.ruby.network.ext_links02.int_node.port_buffers016 system.ruby.network.ext_links02.int_node.port_buffers017 system.ruby.network.ext_links02.int_node.port_buffers018 system.ruby.network.ext_links02.int_node.port_buffers019 system.ruby.network.ext_links02.int_node.port_buffers020 system.ruby.network.ext_links02.int_node.port_buffers021 system.ruby.network.ext_links02.int_node.port_buffers022 system.ruby.network.ext_links02.int_node.port_buffers023 system.ruby.network.ext_links02.int_node.port_buffers024 
system.ruby.network.ext_links02.int_node.port_buffers025 system.ruby.network.ext_links02.int_node.port_buffers026 system.ruby.network.ext_links02.int_node.port_buffers027 system.ruby.network.ext_links02.int_node.port_buffers028 system.ruby.network.ext_links02.int_node.port_buffers029 system.ruby.network.ext_links02.int_node.port_buffers030 system.ruby.network.ext_links02.int_node.port_buffers031 system.ruby.network.ext_links02.int_node.port_buffers032 system.ruby.network.ext_links02.int_node.port_buffers033 system.ruby.network.ext_links02.int_node.port_buffers034 system.ruby.network.ext_links02.int_node.port_buffers035 system.ruby.network.ext_links02.int_node.port_buffers036 system.ruby.network.ext_links02.int_node.port_buffers037 system.ruby.network.ext_links02.int_node.port_buffers038 system.ruby.network.ext_links02.int_node.port_buffers039 system.ruby.network.ext_links02.int_node.port_buffers040 system.ruby.network.ext_links02.int_node.port_buffers041 system.ruby.network.ext_links02.int_node.port_buffers042 system.ruby.network.ext_links02.int_node.port_buffers043 system.ruby.network.ext_links02.int_node.port_buffers044 system.ruby.network.ext_links02.int_node.port_buffers045 system.ruby.network.ext_links02.int_node.port_buffers046 system.ruby.network.ext_links02.int_node.port_buffers047 system.ruby.network.ext_links02.int_node.port_buffers048 system.ruby.network.ext_links02.int_node.port_buffers049 system.ruby.network.ext_links02.int_node.port_buffers050 system.ruby.network.ext_links02.int_node.port_buffers051 system.ruby.network.ext_links02.int_node.port_buffers052 system.ruby.network.ext_links02.int_node.port_buffers053 system.ruby.network.ext_links02.int_node.port_buffers054 system.ruby.network.ext_links02.int_node.port_buffers055 system.ruby.network.ext_links02.int_node.port_buffers056 system.ruby.network.ext_links02.int_node.port_buffers057 system.ruby.network.ext_links02.int_node.port_buffers058 system.ruby.network.ext_links02.int_node.port_buffers059 
system.ruby.network.ext_links02.int_node.port_buffers060 system.ruby.network.ext_links02.int_node.port_buffers061 system.ruby.network.ext_links02.int_node.port_buffers062 system.ruby.network.ext_links02.int_node.port_buffers063 system.ruby.network.ext_links02.int_node.port_buffers064 system.ruby.network.ext_links02.int_node.port_buffers065 system.ruby.network.ext_links02.int_node.port_buffers066 system.ruby.network.ext_links02.int_node.port_buffers067 system.ruby.network.ext_links02.int_node.port_buffers068 system.ruby.network.ext_links02.int_node.port_buffers069 system.ruby.network.ext_links02.int_node.port_buffers070 system.ruby.network.ext_links02.int_node.port_buffers071 system.ruby.network.ext_links02.int_node.port_buffers072 system.ruby.network.ext_links02.int_node.port_buffers073 system.ruby.network.ext_links02.int_node.port_buffers074 system.ruby.network.ext_links02.int_node.port_buffers075 system.ruby.network.ext_links02.int_node.port_buffers076 system.ruby.network.ext_links02.int_node.port_buffers077 system.ruby.network.ext_links02.int_node.port_buffers078 system.ruby.network.ext_links02.int_node.port_buffers079 system.ruby.network.ext_links02.int_node.port_buffers080 system.ruby.network.ext_links02.int_node.port_buffers081 system.ruby.network.ext_links02.int_node.port_buffers082 system.ruby.network.ext_links02.int_node.port_buffers083 system.ruby.network.ext_links02.int_node.port_buffers084 system.ruby.network.ext_links02.int_node.port_buffers085 system.ruby.network.ext_links02.int_node.port_buffers086 system.ruby.network.ext_links02.int_node.port_buffers087 system.ruby.network.ext_links02.int_node.port_buffers088 system.ruby.network.ext_links02.int_node.port_buffers089 system.ruby.network.ext_links02.int_node.port_buffers090 system.ruby.network.ext_links02.int_node.port_buffers091 system.ruby.network.ext_links02.int_node.port_buffers092 system.ruby.network.ext_links02.int_node.port_buffers093 system.ruby.network.ext_links02.int_node.port_buffers094 
system.ruby.network.ext_links02.int_node.port_buffers095 system.ruby.network.ext_links02.int_node.port_buffers096 system.ruby.network.ext_links02.int_node.port_buffers097 system.ruby.network.ext_links02.int_node.port_buffers098 system.ruby.network.ext_links02.int_node.port_buffers099 system.ruby.network.ext_links02.int_node.port_buffers100 system.ruby.network.ext_links02.int_node.port_buffers101 system.ruby.network.ext_links02.int_node.port_buffers102 system.ruby.network.ext_links02.int_node.port_buffers103 system.ruby.network.ext_links02.int_node.port_buffers104 system.ruby.network.ext_links02.int_node.port_buffers105 system.ruby.network.ext_links02.int_node.port_buffers106 system.ruby.network.ext_links02.int_node.port_buffers107 system.ruby.network.ext_links02.int_node.port_buffers108 system.ruby.network.ext_links02.int_node.port_buffers109 system.ruby.network.ext_links02.int_node.port_buffers110 system.ruby.network.ext_links02.int_node.port_buffers111 system.ruby.network.ext_links02.int_node.port_buffers112 system.ruby.network.ext_links02.int_node.port_buffers113 system.ruby.network.ext_links02.int_node.port_buffers114 system.ruby.network.ext_links02.int_node.port_buffers115 system.ruby.network.ext_links02.int_node.port_buffers116 system.ruby.network.ext_links02.int_node.port_buffers117 system.ruby.network.ext_links02.int_node.port_buffers118 system.ruby.network.ext_links02.int_node.port_buffers119 system.ruby.network.ext_links02.int_node.port_buffers120 system.ruby.network.ext_links02.int_node.port_buffers121 system.ruby.network.ext_links02.int_node.port_buffers122 system.ruby.network.ext_links02.int_node.port_buffers123 system.ruby.network.ext_links02.int_node.port_buffers124 system.ruby.network.ext_links02.int_node.port_buffers125 system.ruby.network.ext_links02.int_node.port_buffers126 system.ruby.network.ext_links02.int_node.port_buffers127 system.ruby.network.ext_links02.int_node.port_buffers128 system.ruby.network.ext_links02.int_node.port_buffers129 
system.ruby.network.ext_links02.int_node.port_buffers130 system.ruby.network.ext_links02.int_node.port_buffers131 system.ruby.network.ext_links02.int_node.port_buffers132 system.ruby.network.ext_links02.int_node.port_buffers133 system.ruby.network.ext_links02.int_node.port_buffers134 system.ruby.network.ext_links02.int_node.port_buffers135 system.ruby.network.ext_links02.int_node.port_buffers136 system.ruby.network.ext_links02.int_node.port_buffers137 system.ruby.network.ext_links02.int_node.port_buffers138 system.ruby.network.ext_links02.int_node.port_buffers139 system.ruby.network.ext_links02.int_node.port_buffers140 system.ruby.network.ext_links02.int_node.port_buffers141 system.ruby.network.ext_links02.int_node.port_buffers142 system.ruby.network.ext_links02.int_node.port_buffers143 system.ruby.network.ext_links02.int_node.port_buffers144 system.ruby.network.ext_links02.int_node.port_buffers145 system.ruby.network.ext_links02.int_node.port_buffers146 system.ruby.network.ext_links02.int_node.port_buffers147 system.ruby.network.ext_links02.int_node.port_buffers148 system.ruby.network.ext_links02.int_node.port_buffers149 +router_id=2 +virt_nets=10 + +[system.ruby.network.ext_links02.int_node.port_buffers000] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers001] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers002] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers003] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers004] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers005] +type=MessageBuffer 
+buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers006] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers007] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers008] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers009] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers010] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers011] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers012] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers013] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers014] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers015] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers016] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers017] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers018] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links02.int_node.port_buffers019] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers020] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers021] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers022] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers023] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers024] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers025] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers026] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers027] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers028] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers029] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers030] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers031] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers032] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers033] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers034] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers035] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers036] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers037] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers038] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers039] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers040] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers041] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers042] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers043] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers044] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers045] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers046] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers047] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers048] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers049] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers050] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers051] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers052] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers053] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers054] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers055] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers056] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers057] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers058] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links02.int_node.port_buffers059] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers060] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers061] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers062] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers063] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers064] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers065] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers066] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers067] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers068] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers069] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers070] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers071] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers072] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers073] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers074] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers075] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers076] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers077] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers078] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers079] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers080] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers081] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers082] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers083] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers084] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers085] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers086] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers087] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers088] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers089] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers090] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers091] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers092] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers093] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers094] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers095] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers096] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers097] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers098] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links02.int_node.port_buffers099] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers100] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers101] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers102] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers103] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers104] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers105] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers106] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers107] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers108] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers109] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers110] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers111] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers112] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers113] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers114] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers115] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers116] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers117] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers118] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers119] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers120] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers121] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers122] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers123] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers124] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers125] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers126] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers127] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers128] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers129] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers130] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers131] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers132] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers133] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers134] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers135] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers136] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers137] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers138] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + 
+[system.ruby.network.ext_links02.int_node.port_buffers139] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers140] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers141] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers142] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers143] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers144] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers145] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers146] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers147] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers148] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links02.int_node.port_buffers149] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.ext_links03] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl1 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=3 +weight=1 + +[system.ruby.network.ext_links04] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl2 
+int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=4 +weight=1 + +[system.ruby.network.ext_links05] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl3 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=5 +weight=1 + +[system.ruby.network.ext_links06] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl4 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=6 +weight=1 + +[system.ruby.network.ext_links07] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl5 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=7 +weight=1 + +[system.ruby.network.ext_links08] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl6 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=8 +weight=1 + +[system.ruby.network.ext_links09] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcp_cntrl7 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=9 +weight=1 + +[system.ruby.network.ext_links10] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.sqc_cntrl0 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=10 +weight=1 + +[system.ruby.network.ext_links11] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.sqc_cntrl1 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=11 +weight=1 + +[system.ruby.network.ext_links12] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tcc_cntrl0 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=12 +weight=1 + +[system.ruby.network.ext_links13] +type=SimpleExtLink +bandwidth_factor=512 +eventq_index=0 +ext_node=system.tccdir_cntrl0 +int_node=system.ruby.network.ext_links02.int_node +latency=1 +link_id=13 +weight=1 + +[system.ruby.network.int_link_buffers00] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers01] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers02] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers03] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers04] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers05] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers06] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers07] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers08] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers09] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers10] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers11] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers12] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers13] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers14] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers15] +type=MessageBuffer +buffer_size=0 
+eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers16] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers17] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers18] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers19] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers20] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers21] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers22] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers23] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers24] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers25] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers26] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers27] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers28] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers29] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers30] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true 
+randomization=false + +[system.ruby.network.int_link_buffers31] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers32] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers33] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers34] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers35] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers36] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers37] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers38] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_link_buffers39] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.ruby.network.int_links0] +type=SimpleIntLink +bandwidth_factor=512 +eventq_index=0 +latency=1 +link_id=0 +node_a=system.ruby.network.ext_links00.int_node +node_b=system.ruby.network.ext_links01.int_node +weight=1 + +[system.ruby.network.int_links1] +type=SimpleIntLink +bandwidth_factor=512 +eventq_index=0 +latency=1 +link_id=1 +node_a=system.ruby.network.ext_links00.int_node +node_b=system.ruby.network.ext_links02.int_node +weight=1 + +[system.sqc_cntrl0] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore +L1cache=system.sqc_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 
+mandatoryQueue=system.sqc_cntrl0.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.sqc_cntrl0.probeToSQC +recycle_latency=10 +requestFromSQC=system.sqc_cntrl0.requestFromSQC +responseFromSQC=system.sqc_cntrl0.responseFromSQC +responseToSQC=system.sqc_cntrl0.responseToSQC +ruby_system=system.ruby +sequencer=system.sqc_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.sqc_cntrl0.unblockFromCore +version=0 + +[system.sqc_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=4 + +[system.sqc_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.sqc_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.sqc_cntrl0.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[21] + +[system.sqc_cntrl0.requestFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[29] + +[system.sqc_cntrl0.responseFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[30] + +[system.sqc_cntrl0.responseToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[22] + +[system.sqc_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.sqc_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.sqc_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false 
+max_outstanding_requests=16 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=true +version=18 +slave=system.cpu.cpuInstPort[0] + +[system.sqc_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[31] + +[system.sqc_cntrl1] +type=SQC_Controller +children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore +L1cache=system.sqc_cntrl1.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +issue_latency=80 +l2_hit_latency=18 +mandatoryQueue=system.sqc_cntrl1.mandatoryQueue +number_of_TBEs=256 +probeToSQC=system.sqc_cntrl1.probeToSQC +recycle_latency=10 +requestFromSQC=system.sqc_cntrl1.requestFromSQC +responseFromSQC=system.sqc_cntrl1.responseFromSQC +responseToSQC=system.sqc_cntrl1.responseToSQC +ruby_system=system.ruby +sequencer=system.sqc_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.sqc_cntrl1.unblockFromCore +version=1 + +[system.sqc_cntrl1.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.sqc_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=32768 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=4 + +[system.sqc_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=32768 + +[system.sqc_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.sqc_cntrl1.probeToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[23] + +[system.sqc_cntrl1.requestFromSQC] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[32] + +[system.sqc_cntrl1.responseFromSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[33] + +[system.sqc_cntrl1.responseToSQC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[24] + +[system.sqc_cntrl1.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.sqc_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.sqc_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=16 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=false +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=true +version=19 +slave=system.cpu.cpuInstPort[1] + +[system.sqc_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[34] + +[system.sys_port_proxy] +type=RubyPortProxy +clk_domain=system.clk_domain +eventq_index=0 +is_cpu_sequencer=true +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_ruby_tester=false +version=0 +slave=system.system_port + +[system.tcc_cntrl0] +type=TCC_Controller +children=L2cache responseFromTCC responseToTCC w_TCCUnblockToTCCDir w_probeToTCC w_reqToTCC w_reqToTCCDir w_respToTCC w_respToTCCDir +L2cache=system.tcc_cntrl0.L2cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +eventq_index=0 +l2_request_latency=1 +l2_response_latency=16 +number_of_TBEs=2048 +recycle_latency=10 +responseFromTCC=system.tcc_cntrl0.responseFromTCC +responseToTCC=system.tcc_cntrl0.responseToTCC +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +version=0 
+w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir +w_probeToTCC=system.tcc_cntrl0.w_probeToTCC +w_reqToTCC=system.tcc_cntrl0.w_reqToTCC +w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir +w_respToTCC=system.tcc_cntrl0.w_respToTCC +w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir + +[system.tcc_cntrl0.L2cache] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=8 +dataArrayBanks=256 +eventq_index=0 +is_icache=false +replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=262144.0 +start_index_bit=6 +tagAccessLatency=2 +tagArrayBanks=256 + +[system.tcc_cntrl0.L2cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=262144.0 + +[system.tcc_cntrl0.responseFromTCC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[35] + +[system.tcc_cntrl0.responseToTCC] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[25] + +[system.tcc_cntrl0.w_TCCUnblockToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_probeToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_reqToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_reqToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_respToTCC] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tcc_cntrl0.w_respToTCCDir] +type=RubyWireBuffer +eventq_index=0 +ruby_system=system.ruby + +[system.tccdir_cntrl0] +type=TCCdir_Controller +children=directory probeFromNB probeToCore requestFromTCP requestToNB responseFromNB responseFromTCP responseToCore responseToNB triggerQueue unblockFromTCP unblockToNB +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain 
+cluster_id=0 +directory=system.tccdir_cntrl0.directory +directory_latency=6 +eventq_index=0 +issue_latency=120 +number_of_TBEs=1024 +probeFromNB=system.tccdir_cntrl0.probeFromNB +probeToCore=system.tccdir_cntrl0.probeToCore +recycle_latency=10 +requestFromTCP=system.tccdir_cntrl0.requestFromTCP +requestToNB=system.tccdir_cntrl0.requestToNB +responseFromNB=system.tccdir_cntrl0.responseFromNB +responseFromTCP=system.tccdir_cntrl0.responseFromTCP +responseToCore=system.tccdir_cntrl0.responseToCore +responseToNB=system.tccdir_cntrl0.responseToNB +response_latency=5 +ruby_system=system.ruby +system=system +transitions_per_cycle=32 +triggerQueue=system.tccdir_cntrl0.triggerQueue +unblockFromTCP=system.tccdir_cntrl0.unblockFromTCP +unblockToNB=system.tccdir_cntrl0.unblockToNB +version=0 +w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir +w_probeToTCC=system.tcc_cntrl0.w_probeToTCC +w_reqToTCC=system.tcc_cntrl0.w_reqToTCC +w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir +w_respToTCC=system.tcc_cntrl0.w_respToTCC +w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir + +[system.tccdir_cntrl0.directory] +type=RubyCache +children=replacement_policy +assoc=16 +block_size=0 +dataAccessLatency=1 +dataArrayBanks=1 +eventq_index=0 +is_icache=false +replacement_policy=system.tccdir_cntrl0.directory.replacement_policy +resourceStalls=false +ruby_system=system.ruby +size=786432 +start_index_bit=6 +tagAccessLatency=1 +tagArrayBanks=1 + +[system.tccdir_cntrl0.directory.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=16 +block_size=64 +eventq_index=0 +size=786432 + +[system.tccdir_cntrl0.probeFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[29] + +[system.tccdir_cntrl0.probeToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[36] + +[system.tccdir_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 
+ordered=true +randomization=false +slave=system.ruby.network.master[26] + +[system.tccdir_cntrl0.requestToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[38] + +[system.tccdir_cntrl0.responseFromNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +slave=system.ruby.network.master[30] + +[system.tccdir_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[27] + +[system.tccdir_cntrl0.responseToCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[37] + +[system.tccdir_cntrl0.responseToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[39] + +[system.tccdir_cntrl0.triggerQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false + +[system.tccdir_cntrl0.unblockFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[28] + +[system.tccdir_cntrl0.unblockToNB] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false +master=system.ruby.network.slave[40] + +[system.tcp_cntrl0] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl0.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl0.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl0.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl0.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl0.requestFromTCP +responseFromTCP=system.tcp_cntrl0.responseFromTCP 
+responseToTCP=system.tcp_cntrl0.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl0.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl0.unblockFromCore +use_seq_not_coal=false +version=0 + +[system.tcp_cntrl0.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl0.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl0.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=2 +slave=system.cpu.cpuDataPort[0] + +[system.tcp_cntrl0.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl0.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[5] + +[system.tcp_cntrl0.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[5] + +[system.tcp_cntrl0.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[6] + +[system.tcp_cntrl0.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false 
+slave=system.ruby.network.master[6] + +[system.tcp_cntrl0.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl0.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl0.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=3 + +[system.tcp_cntrl0.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[7] + +[system.tcp_cntrl1] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl1.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl1.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl1.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl1.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl1.requestFromTCP +responseFromTCP=system.tcp_cntrl1.responseFromTCP +responseToTCP=system.tcp_cntrl1.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl1.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl1.unblockFromCore +use_seq_not_coal=false +version=1 + +[system.tcp_cntrl1.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl1.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 
+size=16384 + +[system.tcp_cntrl1.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=4 +slave=system.cpu.cpuDataPort[1] + +[system.tcp_cntrl1.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl1.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[7] + +[system.tcp_cntrl1.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[8] + +[system.tcp_cntrl1.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[9] + +[system.tcp_cntrl1.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[8] + +[system.tcp_cntrl1.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl1.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl1.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=5 + +[system.tcp_cntrl1.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[10] + +[system.tcp_cntrl2] +type=TCP_Controller 
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl2.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl2.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl2.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl2.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl2.requestFromTCP +responseFromTCP=system.tcp_cntrl2.responseFromTCP +responseToTCP=system.tcp_cntrl2.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl2.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl2.unblockFromCore +use_seq_not_coal=false +version=2 + +[system.tcp_cntrl2.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl2.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl2.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl2.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl2.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl2.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=6 +slave=system.cpu.cpuDataPort[2] + +[system.tcp_cntrl2.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl2.probeToTCP] 
+type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[9] + +[system.tcp_cntrl2.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[11] + +[system.tcp_cntrl2.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[12] + +[system.tcp_cntrl2.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[10] + +[system.tcp_cntrl2.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl2.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl2.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=7 + +[system.tcp_cntrl2.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[13] + +[system.tcp_cntrl3] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl3.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl3.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl3.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl3.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl3.requestFromTCP +responseFromTCP=system.tcp_cntrl3.responseFromTCP +responseToTCP=system.tcp_cntrl3.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl3.sequencer +system=system 
+transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl3.unblockFromCore +use_seq_not_coal=false +version=3 + +[system.tcp_cntrl3.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl3.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl3.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl3.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl3.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl3.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=8 +slave=system.cpu.cpuDataPort[3] + +[system.tcp_cntrl3.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl3.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[11] + +[system.tcp_cntrl3.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[14] + +[system.tcp_cntrl3.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[15] + +[system.tcp_cntrl3.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[12] + +[system.tcp_cntrl3.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 
+dcache=system.tcp_cntrl3.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl3.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=9 + +[system.tcp_cntrl3.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[16] + +[system.tcp_cntrl4] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl4.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl4.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl4.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl4.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl4.requestFromTCP +responseFromTCP=system.tcp_cntrl4.responseFromTCP +responseToTCP=system.tcp_cntrl4.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl4.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl4.unblockFromCore +use_seq_not_coal=false +version=4 + +[system.tcp_cntrl4.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl4.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl4.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl4.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 
+dcache=system.tcp_cntrl4.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl4.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=10 +slave=system.cpu.cpuDataPort[4] + +[system.tcp_cntrl4.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl4.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[13] + +[system.tcp_cntrl4.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[17] + +[system.tcp_cntrl4.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[18] + +[system.tcp_cntrl4.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[14] + +[system.tcp_cntrl4.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl4.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl4.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=11 + +[system.tcp_cntrl4.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[19] + +[system.tcp_cntrl5] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore 
+L1cache=system.tcp_cntrl5.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl5.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl5.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl5.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl5.requestFromTCP +responseFromTCP=system.tcp_cntrl5.responseFromTCP +responseToTCP=system.tcp_cntrl5.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl5.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl5.unblockFromCore +use_seq_not_coal=false +version=5 + +[system.tcp_cntrl5.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl5.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl5.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl5.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl5.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl5.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=12 +slave=system.cpu.cpuDataPort[5] + +[system.tcp_cntrl5.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl5.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[15] + 
+[system.tcp_cntrl5.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[20] + +[system.tcp_cntrl5.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[21] + +[system.tcp_cntrl5.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[16] + +[system.tcp_cntrl5.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl5.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl5.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=13 + +[system.tcp_cntrl5.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[22] + +[system.tcp_cntrl6] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl6.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl6.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl6.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl6.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl6.requestFromTCP +responseFromTCP=system.tcp_cntrl6.responseFromTCP +responseToTCP=system.tcp_cntrl6.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl6.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl6.unblockFromCore +use_seq_not_coal=false +version=6 + 
+[system.tcp_cntrl6.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl6.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl6.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl6.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl6.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl6.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=14 +slave=system.cpu.cpuDataPort[6] + +[system.tcp_cntrl6.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl6.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[17] + +[system.tcp_cntrl6.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[23] + +[system.tcp_cntrl6.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[24] + +[system.tcp_cntrl6.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[18] + +[system.tcp_cntrl6.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl6.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 
+icache=system.tcp_cntrl6.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=15 + +[system.tcp_cntrl6.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[25] + +[system.tcp_cntrl7] +type=TCP_Controller +children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore +L1cache=system.tcp_cntrl7.L1cache +TCC_select_num_bits=0 +buffer_size=0 +clk_domain=system.clk_domain +cluster_id=0 +coalescer=system.tcp_cntrl7.coalescer +eventq_index=0 +issue_latency=40 +l2_hit_latency=18 +mandatoryQueue=system.tcp_cntrl7.mandatoryQueue +number_of_TBEs=2560 +probeToTCP=system.tcp_cntrl7.probeToTCP +recycle_latency=10 +requestFromTCP=system.tcp_cntrl7.requestFromTCP +responseFromTCP=system.tcp_cntrl7.responseFromTCP +responseToTCP=system.tcp_cntrl7.responseToTCP +ruby_system=system.ruby +sequencer=system.tcp_cntrl7.sequencer +system=system +transitions_per_cycle=32 +unblockFromCore=system.tcp_cntrl7.unblockFromCore +use_seq_not_coal=false +version=7 + +[system.tcp_cntrl7.L1cache] +type=RubyCache +children=replacement_policy +assoc=8 +block_size=0 +dataAccessLatency=4 +dataArrayBanks=16 +eventq_index=0 +is_icache=false +replacement_policy=system.tcp_cntrl7.L1cache.replacement_policy +resourceStalls=true +ruby_system=system.ruby +size=16384 +start_index_bit=6 +tagAccessLatency=4 +tagArrayBanks=4 + +[system.tcp_cntrl7.L1cache.replacement_policy] +type=PseudoLRUReplacementPolicy +assoc=8 +block_size=64 +eventq_index=0 +size=16384 + +[system.tcp_cntrl7.coalescer] +type=RubyGPUCoalescer +assume_rfo=true +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl7.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 
+icache=system.tcp_cntrl7.L1cache +icache_hit_latency=1 +is_cpu_sequencer=false +max_outstanding_requests=2560 +no_retry_on_stall=true +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=false +system=system +using_network_tester=false +using_ruby_tester=true +version=16 +slave=system.cpu.cpuDataPort[7] + +[system.tcp_cntrl7.mandatoryQueue] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=false +randomization=false + +[system.tcp_cntrl7.probeToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[19] + +[system.tcp_cntrl7.requestFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[26] + +[system.tcp_cntrl7.responseFromTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[27] + +[system.tcp_cntrl7.responseToTCP] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +slave=system.ruby.network.master[20] + +[system.tcp_cntrl7.sequencer] +type=RubySequencer +clk_domain=system.clk_domain +coreid=99 +dcache=system.tcp_cntrl7.L1cache +dcache_hit_latency=1 +deadlock_threshold=500000 +eventq_index=0 +icache=system.tcp_cntrl7.L1cache +icache_hit_latency=1 +is_cpu_sequencer=true +max_outstanding_requests=16 +no_retry_on_stall=false +ruby_system=system.ruby +support_data_reqs=true +support_inst_reqs=true +system=system +using_network_tester=false +using_ruby_tester=false +version=17 + +[system.tcp_cntrl7.unblockFromCore] +type=MessageBuffer +buffer_size=0 +eventq_index=0 +ordered=true +randomization=false +master=system.ruby.network.slave[28] + +[system.voltage_domain] +type=VoltageDomain +eventq_index=0 +voltage=1.000000 + diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr new 
file mode 100755 index 000000000..13060c953 --- /dev/null +++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr @@ -0,0 +1,10 @@ +warn: system.ruby.network adopting orphan SimObject param 'int_links' +warn: system.ruby.network adopting orphan SimObject param 'ext_links' +warn: rounding error > tolerance + 1.250000 rounded to 1 +warn: rounding error > tolerance + 1.250000 rounded to 1 +warn: rounding error > tolerance + 1.250000 rounded to 1 +warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (256 Mbytes) +warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files! diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout new file mode 100755 index 000000000..62d7346d7 --- /dev/null +++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout @@ -0,0 +1,11 @@ +gem5 Simulator System. http://gem5.org +gem5 is copyrighted software; use the --copyright option for details. + +gem5 compiled Jan 19 2016 13:28:55 +gem5 started Jan 19 2016 13:29:16 +gem5 executing on zizzer, pid 48851 +command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/60.gpu-randomtest/x86/linux/gpu-randomtest-ruby-GPU_RfO -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/60.gpu-randomtest/x86/linux/gpu-randomtest-ruby-GPU_RfO + +Global frequency set at 1000000000 ticks per second +info: Entering event queue @ 0. Starting simulation... 
+Exiting @ tick 14181 because Ruby Tester completed diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt new file mode 100644 index 000000000..75065fd02 --- /dev/null +++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt @@ -0,0 +1,1072 @@ + +---------- Begin Simulation Statistics ---------- +sim_seconds 0.000014 # Number of seconds simulated +sim_ticks 14181 # Number of ticks simulated +final_tick 14181 # Number of ticks from beginning of simulation (restored from checkpoints and never reset) +sim_freq 1000000000 # Frequency of simulated ticks +host_tick_rate 88786 # Simulator tick rate (ticks/s) +host_mem_usage 463996 # Number of bytes of host memory used +host_seconds 0.16 # Real time elapsed on the host +system.voltage_domain.voltage 1 # Voltage in Volts +system.clk_domain.clock 1 # Clock period in ticks +system.mem_ctrls.bytes_read::dir_cntrl0 16576 # Number of bytes read from this memory +system.mem_ctrls.bytes_read::total 16576 # Number of bytes read from this memory +system.mem_ctrls.bytes_written::dir_cntrl0 576 # Number of bytes written to this memory +system.mem_ctrls.bytes_written::total 576 # Number of bytes written to this memory +system.mem_ctrls.num_reads::dir_cntrl0 259 # Number of read requests responded to by this memory +system.mem_ctrls.num_reads::total 259 # Number of read requests responded to by this memory +system.mem_ctrls.num_writes::dir_cntrl0 9 # Number of write requests responded to by this memory +system.mem_ctrls.num_writes::total 9 # Number of write requests responded to by this memory +system.mem_ctrls.bw_read::dir_cntrl0 1168887949 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_read::total 1168887949 # Total read bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_write::dir_cntrl0 40617728 # Write bandwidth from this memory (bytes/s) 
+system.mem_ctrls.bw_write::total 40617728 # Write bandwidth from this memory (bytes/s) +system.mem_ctrls.bw_total::dir_cntrl0 1209505677 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.bw_total::total 1209505677 # Total bandwidth to/from this memory (bytes/s) +system.mem_ctrls.readReqs 259 # Number of read requests accepted +system.mem_ctrls.writeReqs 9 # Number of write requests accepted +system.mem_ctrls.readBursts 259 # Number of DRAM read bursts, including those serviced by the write queue +system.mem_ctrls.writeBursts 9 # Number of DRAM write bursts, including those merged in the write queue +system.mem_ctrls.bytesReadDRAM 15936 # Total number of bytes read from DRAM +system.mem_ctrls.bytesReadWrQ 640 # Total number of bytes read from write queue +system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM +system.mem_ctrls.bytesReadSys 16576 # Total read bytes from the system interface side +system.mem_ctrls.bytesWrittenSys 576 # Total written bytes from the system interface side +system.mem_ctrls.servicedByWrQ 10 # Number of DRAM read bursts serviced by the write queue +system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one +system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write +system.mem_ctrls.perBankRdBursts::0 100 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::1 71 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::2 66 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::3 12 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::9 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::10 0 # Per bank 
write bursts +system.mem_ctrls.perBankRdBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankRdBursts::15 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts +system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts +system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry +system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry +system.mem_ctrls.totGap 13941 # Total gap between requests +system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2) +system.mem_ctrls.readPktSize::6 259 # Read 
request sizes (log2) +system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2) +system.mem_ctrls.writePktSize::6 9 # Write request sizes (log2) +system.mem_ctrls.rdQLenPdf::0 214 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::1 27 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::2 7 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::3 1 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::4 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::5 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see 
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see +system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::0 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::1 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::2 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::3 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::4 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::5 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::6 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::7 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::8 1 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an 
incoming req see +system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see 
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::56 0 # What 
write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see +system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see +system.mem_ctrls.bytesPerActivate::samples 15 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::mean 913.066667 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::gmean 883.543279 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::stdev 210.139908 # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::512-639 3 20.00% 20.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::896-1023 1 6.67% 26.67% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::1024-1151 11 73.33% 100.00% # Bytes accessed per row activation +system.mem_ctrls.bytesPerActivate::total 15 # Bytes accessed per row activation +system.mem_ctrls.totQLat 973 # Total ticks spent queuing +system.mem_ctrls.totMemAccLat 5704 # Total ticks spent from burst creation until serviced by the DRAM +system.mem_ctrls.totBusLat 1245 # Total ticks spent in databus transfers +system.mem_ctrls.avgQLat 3.91 # Average queueing delay per DRAM burst +system.mem_ctrls.avgBusLat 5.00 # Average bus latency per DRAM burst +system.mem_ctrls.avgMemAccLat 22.91 # Average memory access latency per DRAM burst +system.mem_ctrls.avgRdBW 1123.76 # Average DRAM read bandwidth in MiByte/s +system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s 
+system.mem_ctrls.avgRdBWSys 1168.89 # Average system read bandwidth in MiByte/s +system.mem_ctrls.avgWrBWSys 40.62 # Average system write bandwidth in MiByte/s +system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s +system.mem_ctrls.busUtil 8.78 # Data bus utilization in percentage +system.mem_ctrls.busUtilRead 8.78 # Data bus utilization in percentage for reads +system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes +system.mem_ctrls.avgRdQLen 1.17 # Average read queue length when enqueuing +system.mem_ctrls.avgWrQLen 2.63 # Average write queue length when enqueuing +system.mem_ctrls.readRowHits 230 # Number of row buffer hits during reads +system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes +system.mem_ctrls.readRowHitRate 92.37 # Row buffer hit rate for reads +system.mem_ctrls.writeRowHitRate 0.00 # Row buffer hit rate for writes +system.mem_ctrls.avgGap 52.02 # Average gap between requests +system.mem_ctrls.pageHitRate 89.15 # Row buffer hit rate, read and write combined +system.mem_ctrls_0.actEnergy 83160 # Energy for activate commands per rank (pJ) +system.mem_ctrls_0.preEnergy 46200 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_0.readEnergy 1872000 # Energy for read commands per rank (pJ) +system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_0.refreshEnergy 508560 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_0.actBackEnergy 5437116 # Energy for active background per rank (pJ) +system.mem_ctrls_0.preBackEnergy 58200 # Energy for precharge background per rank (pJ) +system.mem_ctrls_0.totalEnergy 8005236 # Total energy per rank (pJ) +system.mem_ctrls_0.averagePower 994.933632 # Core power per rank (mW) +system.mem_ctrls_0.memoryStateTime::IDLE 83 # Time in different power states +system.mem_ctrls_0.memoryStateTime::REF 260 # Time in different power states +system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in 
different power states +system.mem_ctrls_0.memoryStateTime::ACT 7717 # Time in different power states +system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states +system.mem_ctrls_1.actEnergy 0 # Energy for activate commands per rank (pJ) +system.mem_ctrls_1.preEnergy 0 # Energy for precharge commands per rank (pJ) +system.mem_ctrls_1.readEnergy 0 # Energy for read commands per rank (pJ) +system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ) +system.mem_ctrls_1.refreshEnergy 508560 # Energy for refresh commands per rank (pJ) +system.mem_ctrls_1.actBackEnergy 168264 # Energy for active background per rank (pJ) +system.mem_ctrls_1.preBackEnergy 4671600 # Energy for precharge background per rank (pJ) +system.mem_ctrls_1.totalEnergy 5348424 # Total energy per rank (pJ) +system.mem_ctrls_1.averagePower 665.889442 # Core power per rank (mW) +system.mem_ctrls_1.memoryStateTime::IDLE 7786 # Time in different power states +system.mem_ctrls_1.memoryStateTime::REF 260 # Time in different power states +system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT 0 # Time in different power states +system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states +system.ruby.clk_domain.clock 1 # Clock period in ticks +system.ruby.outstanding_req_hist::bucket_size 2 +system.ruby.outstanding_req_hist::max_bucket 19 +system.ruby.outstanding_req_hist::samples 63 +system.ruby.outstanding_req_hist::mean 12.920635 +system.ruby.outstanding_req_hist::gmean 11.694862 +system.ruby.outstanding_req_hist::stdev 4.228557 +system.ruby.outstanding_req_hist | 1 1.59% 1.59% | 2 3.17% 4.76% | 2 3.17% 7.94% | 5 7.94% 15.87% | 4 6.35% 22.22% | 3 4.76% 26.98% | 5 7.94% 34.92% | 14 22.22% 57.14% | 27 42.86% 100.00% | 0 0.00% 100.00% +system.ruby.outstanding_req_hist::total 63 +system.ruby.latency_hist::bucket_size 1024 +system.ruby.latency_hist::max_bucket 10239 
+system.ruby.latency_hist::samples 48 +system.ruby.latency_hist::mean 3351.354167 +system.ruby.latency_hist::gmean 1865.352879 +system.ruby.latency_hist::stdev 1934.275107 +system.ruby.latency_hist | 11 22.92% 22.92% | 3 6.25% 29.17% | 3 6.25% 35.42% | 7 14.58% 50.00% | 18 37.50% 87.50% | 6 12.50% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.latency_hist::total 48 +system.ruby.hit_latency_hist::bucket_size 1024 +system.ruby.hit_latency_hist::max_bucket 10239 +system.ruby.hit_latency_hist::samples 42 +system.ruby.hit_latency_hist::mean 3684.428571 +system.ruby.hit_latency_hist::gmean 2778.454716 +system.ruby.hit_latency_hist::stdev 1783.107224 +system.ruby.hit_latency_hist | 7 16.67% 16.67% | 3 7.14% 23.81% | 1 2.38% 26.19% | 7 16.67% 42.86% | 18 42.86% 85.71% | 6 14.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.hit_latency_hist::total 42 +system.ruby.miss_latency_hist::bucket_size 512 +system.ruby.miss_latency_hist::max_bucket 5119 +system.ruby.miss_latency_hist::samples 6 +system.ruby.miss_latency_hist::mean 1019.833333 +system.ruby.miss_latency_hist::gmean 114.673945 +system.ruby.miss_latency_hist::stdev 1281.644790 +system.ruby.miss_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.miss_latency_hist::total 6 +system.ruby.L1Cache.incomplete_times 6 +system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D0cache.demand_misses 45 # Number of cache demand misses +system.cp_cntrl0.L1D0cache.demand_accesses 45 # Number of cache demand accesses +system.cp_cntrl0.L1D0cache.num_data_array_writes 43 # number of data array writes +system.cp_cntrl0.L1D0cache.num_tag_array_reads 154 # number of tag array reads +system.cp_cntrl0.L1D0cache.num_tag_array_writes 41 # number of tag array writes 
+system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1D1cache.demand_misses 43 # Number of cache demand misses +system.cp_cntrl0.L1D1cache.demand_accesses 43 # Number of cache demand accesses +system.cp_cntrl0.L1D1cache.num_data_array_writes 41 # number of data array writes +system.cp_cntrl0.L1D1cache.num_tag_array_reads 73 # number of tag array reads +system.cp_cntrl0.L1D1cache.num_tag_array_writes 41 # number of tag array writes +system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L1Icache.demand_misses 3 # Number of cache demand misses +system.cp_cntrl0.L1Icache.demand_accesses 3 # Number of cache demand accesses +system.cp_cntrl0.L1Icache.num_tag_array_reads 3 # number of tag array reads +system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.cp_cntrl0.L2cache.demand_misses 91 # Number of cache demand misses +system.cp_cntrl0.L2cache.demand_accesses 91 # Number of cache demand accesses +system.cp_cntrl0.L2cache.num_data_array_reads 81 # number of data array reads +system.cp_cntrl0.L2cache.num_data_array_writes 84 # number of data array writes +system.cp_cntrl0.L2cache.num_tag_array_reads 380 # number of tag array reads +system.cp_cntrl0.L2cache.num_tag_array_writes 371 # number of tag array writes +system.cp_cntrl0.sequencer.store_waiting_on_load 2 # Number of times a store aliased with a pending load +system.cp_cntrl0.sequencer.store_waiting_on_store 3 # Number of times a store aliased with a pending store +system.cp_cntrl0.sequencer1.store_waiting_on_load 1 # Number of times a store aliased with a pending load +system.cp_cntrl0.sequencer1.store_waiting_on_store 4 # Number of times a store aliased with a pending store +system.cp_cntrl0.fully_busy_cycles 2 # cycles for which number of transistions == max transitions +system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits +system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand 
misses +system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses +system.dir_cntrl0.L3CacheMemory.num_data_array_writes 374 # number of data array writes +system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 378 # number of tag array reads +system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 378 # number of tag array writes +system.dir_cntrl0.L3CacheMemory.num_tag_array_stalls 10169 # number of stalls caused by tag array +system.dir_cntrl0.L3CacheMemory.num_data_array_stalls 5502 # number of stalls caused by data array +system.ruby.network.ext_links00.int_node.percent_links_utilized 0.199210 +system.ruby.network.ext_links00.int_node.msg_count.Control::0 308 +system.ruby.network.ext_links00.int_node.msg_count.Request_Control::0 385 +system.ruby.network.ext_links00.int_node.msg_count.Response_Data::2 393 +system.ruby.network.ext_links00.int_node.msg_count.Response_Control::2 227 +system.ruby.network.ext_links00.int_node.msg_count.Writeback_Data::2 66 +system.ruby.network.ext_links00.int_node.msg_count.Writeback_Control::2 70 +system.ruby.network.ext_links00.int_node.msg_count.Unblock_Control::4 303 +system.ruby.network.ext_links00.int_node.msg_bytes.Control::0 2464 +system.ruby.network.ext_links00.int_node.msg_bytes.Request_Control::0 3080 +system.ruby.network.ext_links00.int_node.msg_bytes.Response_Data::2 28296 +system.ruby.network.ext_links00.int_node.msg_bytes.Response_Control::2 1816 +system.ruby.network.ext_links00.int_node.msg_bytes.Writeback_Data::2 4752 +system.ruby.network.ext_links00.int_node.msg_bytes.Writeback_Control::2 560 +system.ruby.network.ext_links00.int_node.msg_bytes.Unblock_Control::4 2424 +system.ruby.network.ext_links01.int_node.percent_links_utilized 0.120981 +system.ruby.network.ext_links01.int_node.msg_count.Control::0 227 +system.ruby.network.ext_links01.int_node.msg_count.Request_Control::0 153 +system.ruby.network.ext_links01.int_node.msg_count.Response_Data::2 95 
+system.ruby.network.ext_links01.int_node.msg_count.Response_Control::2 217 +system.ruby.network.ext_links01.int_node.msg_count.Writeback_Data::2 66 +system.ruby.network.ext_links01.int_node.msg_count.Writeback_Control::2 70 +system.ruby.network.ext_links01.int_node.msg_count.Unblock_Control::4 80 +system.ruby.network.ext_links01.int_node.msg_bytes.Control::0 1816 +system.ruby.network.ext_links01.int_node.msg_bytes.Request_Control::0 1224 +system.ruby.network.ext_links01.int_node.msg_bytes.Response_Data::2 6840 +system.ruby.network.ext_links01.int_node.msg_bytes.Response_Control::2 1736 +system.ruby.network.ext_links01.int_node.msg_bytes.Writeback_Data::2 4752 +system.ruby.network.ext_links01.int_node.msg_bytes.Writeback_Control::2 560 +system.ruby.network.ext_links01.int_node.msg_bytes.Unblock_Control::4 640 +system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl0.L1cache.num_data_array_reads 14 # number of data array reads +system.tcp_cntrl0.L1cache.num_data_array_writes 116 # number of data array writes +system.tcp_cntrl0.L1cache.num_tag_array_reads 314 # number of tag array reads +system.tcp_cntrl0.L1cache.num_tag_array_writes 305 # number of tag array writes +system.tcp_cntrl0.L1cache.num_tag_array_stalls 38 # number of stalls caused by tag array +system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 5 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 9 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 79 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC 
+system.tcp_cntrl0.coalescer.gpu_st_misses 21 # stores that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.ruby.network.ext_links02.int_node.percent_links_utilized 0.173894 +system.ruby.network.ext_links02.int_node.msg_count.Control::0 81 +system.ruby.network.ext_links02.int_node.msg_count.Control::1 814 +system.ruby.network.ext_links02.int_node.msg_count.Request_Control::0 232 +system.ruby.network.ext_links02.int_node.msg_count.Request_Control::1 846 +system.ruby.network.ext_links02.int_node.msg_count.Response_Data::2 298 +system.ruby.network.ext_links02.int_node.msg_count.Response_Data::3 1644 +system.ruby.network.ext_links02.int_node.msg_count.Response_Control::2 10 +system.ruby.network.ext_links02.int_node.msg_count.Response_Control::3 2 +system.ruby.network.ext_links02.int_node.msg_count.Unblock_Control::4 223 +system.ruby.network.ext_links02.int_node.msg_count.Unblock_Control::5 831 +system.ruby.network.ext_links02.int_node.msg_bytes.Control::0 648 +system.ruby.network.ext_links02.int_node.msg_bytes.Control::1 6512 +system.ruby.network.ext_links02.int_node.msg_bytes.Request_Control::0 1856 +system.ruby.network.ext_links02.int_node.msg_bytes.Request_Control::1 6768 +system.ruby.network.ext_links02.int_node.msg_bytes.Response_Data::2 21456 +system.ruby.network.ext_links02.int_node.msg_bytes.Response_Data::3 118368 +system.ruby.network.ext_links02.int_node.msg_bytes.Response_Control::2 80 
+system.ruby.network.ext_links02.int_node.msg_bytes.Response_Control::3 16 +system.ruby.network.ext_links02.int_node.msg_bytes.Unblock_Control::4 1784 +system.ruby.network.ext_links02.int_node.msg_bytes.Unblock_Control::5 6648 +system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl1.L1cache.num_data_array_reads 10 # number of data array reads +system.tcp_cntrl1.L1cache.num_data_array_writes 108 # number of data array writes +system.tcp_cntrl1.L1cache.num_tag_array_reads 300 # number of tag array reads +system.tcp_cntrl1.L1cache.num_tag_array_writes 289 # number of tag array writes +system.tcp_cntrl1.L1cache.num_tag_array_stalls 44 # number of stalls caused by tag array +system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 4 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_ld_misses 1 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 9 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 74 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl1.coalescer.gpu_st_misses 20 # stores that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit 
in the TCC +system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.tcp_cntrl2.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl2.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl2.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl2.L1cache.num_data_array_reads 19 # number of data array reads +system.tcp_cntrl2.L1cache.num_data_array_writes 108 # number of data array writes +system.tcp_cntrl2.L1cache.num_tag_array_reads 302 # number of tag array reads +system.tcp_cntrl2.L1cache.num_tag_array_writes 292 # number of tag array writes +system.tcp_cntrl2.L1cache.num_tag_array_stalls 36 # number of stalls caused by tag array +system.tcp_cntrl2.L1cache.num_data_array_stalls 3 # number of stalls caused by data array +system.tcp_cntrl2.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP +system.tcp_cntrl2.coalescer.gpu_tcp_ld_transfers 9 # TCP to TCP load transfers +system.tcp_cntrl2.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl2.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl2.coalescer.gpu_tcp_st_hits 7 # stores that hit in the TCP +system.tcp_cntrl2.coalescer.gpu_tcp_st_transfers 72 # TCP to TCP store transfers +system.tcp_cntrl2.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl2.coalescer.gpu_st_misses 18 # stores that miss in the GPU +system.tcp_cntrl2.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl2.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl2.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl2.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl2.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl2.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl2.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl2.coalescer.cp_st_misses 
0 # stores that miss in the GPU +system.tcp_cntrl3.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl3.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl3.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl3.L1cache.num_data_array_reads 7 # number of data array reads +system.tcp_cntrl3.L1cache.num_data_array_writes 104 # number of data array writes +system.tcp_cntrl3.L1cache.num_tag_array_reads 272 # number of tag array reads +system.tcp_cntrl3.L1cache.num_tag_array_writes 262 # number of tag array writes +system.tcp_cntrl3.L1cache.num_tag_array_stalls 16 # number of stalls caused by tag array +system.tcp_cntrl3.L1cache.num_data_array_stalls 3 # number of stalls caused by data array +system.tcp_cntrl3.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl3.coalescer.gpu_tcp_ld_transfers 13 # TCP to TCP load transfers +system.tcp_cntrl3.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl3.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl3.coalescer.gpu_tcp_st_hits 10 # stores that hit in the TCP +system.tcp_cntrl3.coalescer.gpu_tcp_st_transfers 63 # TCP to TCP store transfers +system.tcp_cntrl3.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl3.coalescer.gpu_st_misses 18 # stores that miss in the GPU +system.tcp_cntrl3.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl3.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl3.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl3.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl3.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl3.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl3.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl3.coalescer.cp_st_misses 0 # stores that miss in the GPU 
+system.tcp_cntrl4.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl4.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl4.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl4.L1cache.num_data_array_reads 14 # number of data array reads +system.tcp_cntrl4.L1cache.num_data_array_writes 115 # number of data array writes +system.tcp_cntrl4.L1cache.num_tag_array_reads 317 # number of tag array reads +system.tcp_cntrl4.L1cache.num_tag_array_writes 309 # number of tag array writes +system.tcp_cntrl4.L1cache.num_tag_array_stalls 29 # number of stalls caused by tag array +system.tcp_cntrl4.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl4.coalescer.gpu_tcp_ld_transfers 4 # TCP to TCP load transfers +system.tcp_cntrl4.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl4.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl4.coalescer.gpu_tcp_st_hits 6 # stores that hit in the TCP +system.tcp_cntrl4.coalescer.gpu_tcp_st_transfers 76 # TCP to TCP store transfers +system.tcp_cntrl4.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl4.coalescer.gpu_st_misses 26 # stores that miss in the GPU +system.tcp_cntrl4.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl4.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl4.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl4.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl4.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl4.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl4.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl4.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.tcp_cntrl5.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl5.L1cache.demand_misses 0 # Number of cache demand 
misses +system.tcp_cntrl5.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl5.L1cache.num_data_array_reads 10 # number of data array reads +system.tcp_cntrl5.L1cache.num_data_array_writes 107 # number of data array writes +system.tcp_cntrl5.L1cache.num_tag_array_reads 295 # number of tag array reads +system.tcp_cntrl5.L1cache.num_tag_array_writes 287 # number of tag array writes +system.tcp_cntrl5.L1cache.num_tag_array_stalls 31 # number of stalls caused by tag array +system.tcp_cntrl5.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl5.coalescer.gpu_tcp_ld_transfers 6 # TCP to TCP load transfers +system.tcp_cntrl5.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl5.coalescer.gpu_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl5.coalescer.gpu_tcp_st_hits 8 # stores that hit in the TCP +system.tcp_cntrl5.coalescer.gpu_tcp_st_transfers 69 # TCP to TCP store transfers +system.tcp_cntrl5.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl5.coalescer.gpu_st_misses 23 # stores that miss in the GPU +system.tcp_cntrl5.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl5.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl5.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl5.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl5.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl5.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl5.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl5.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.tcp_cntrl6.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl6.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl6.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl6.L1cache.num_data_array_reads 13 # 
number of data array reads +system.tcp_cntrl6.L1cache.num_data_array_writes 123 # number of data array writes +system.tcp_cntrl6.L1cache.num_tag_array_reads 342 # number of tag array reads +system.tcp_cntrl6.L1cache.num_tag_array_writes 335 # number of tag array writes +system.tcp_cntrl6.L1cache.num_tag_array_stalls 49 # number of stalls caused by tag array +system.tcp_cntrl6.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP +system.tcp_cntrl6.coalescer.gpu_tcp_ld_transfers 11 # TCP to TCP load transfers +system.tcp_cntrl6.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl6.coalescer.gpu_ld_misses 1 # loads that miss in the GPU +system.tcp_cntrl6.coalescer.gpu_tcp_st_hits 5 # stores that hit in the TCP +system.tcp_cntrl6.coalescer.gpu_tcp_st_transfers 86 # TCP to TCP store transfers +system.tcp_cntrl6.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl6.coalescer.gpu_st_misses 19 # stores that miss in the GPU +system.tcp_cntrl6.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl6.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl6.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl6.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl6.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl6.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl6.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl6.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.tcp_cntrl7.L1cache.demand_hits 0 # Number of cache demand hits +system.tcp_cntrl7.L1cache.demand_misses 0 # Number of cache demand misses +system.tcp_cntrl7.L1cache.demand_accesses 0 # Number of cache demand accesses +system.tcp_cntrl7.L1cache.num_data_array_reads 10 # number of data array reads +system.tcp_cntrl7.L1cache.num_data_array_writes 97 # number of data array writes 
+system.tcp_cntrl7.L1cache.num_tag_array_reads 263 # number of tag array reads +system.tcp_cntrl7.L1cache.num_tag_array_writes 256 # number of tag array writes +system.tcp_cntrl7.L1cache.num_tag_array_stalls 11 # number of stalls caused by tag array +system.tcp_cntrl7.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP +system.tcp_cntrl7.coalescer.gpu_tcp_ld_transfers 10 # TCP to TCP load transfers +system.tcp_cntrl7.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl7.coalescer.gpu_ld_misses 1 # loads that miss in the GPU +system.tcp_cntrl7.coalescer.gpu_tcp_st_hits 6 # stores that hit in the TCP +system.tcp_cntrl7.coalescer.gpu_tcp_st_transfers 63 # TCP to TCP store transfers +system.tcp_cntrl7.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl7.coalescer.gpu_st_misses 16 # stores that miss in the GPU +system.tcp_cntrl7.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP +system.tcp_cntrl7.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers +system.tcp_cntrl7.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC +system.tcp_cntrl7.coalescer.cp_ld_misses 0 # loads that miss in the GPU +system.tcp_cntrl7.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP +system.tcp_cntrl7.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers +system.tcp_cntrl7.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC +system.tcp_cntrl7.coalescer.cp_st_misses 0 # stores that miss in the GPU +system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits +system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses +system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses +system.sqc_cntrl0.L1cache.num_data_array_reads 12 # number of data array reads +system.sqc_cntrl0.L1cache.num_data_array_writes 12 # number of data array writes +system.sqc_cntrl0.L1cache.num_tag_array_reads 22 # number of tag array reads +system.sqc_cntrl0.L1cache.num_tag_array_writes 22 # number of 
tag array writes +system.sqc_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits +system.sqc_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses +system.sqc_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses +system.sqc_cntrl1.L1cache.num_data_array_reads 15 # number of data array reads +system.sqc_cntrl1.L1cache.num_data_array_writes 15 # number of data array writes +system.sqc_cntrl1.L1cache.num_tag_array_reads 29 # number of tag array reads +system.sqc_cntrl1.L1cache.num_tag_array_writes 29 # number of tag array writes +system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits +system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses +system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses +system.tccdir_cntrl0.directory.demand_hits 0 # Number of cache demand hits +system.tccdir_cntrl0.directory.demand_misses 0 # Number of cache demand misses +system.tccdir_cntrl0.directory.demand_accesses 0 # Number of cache demand accesses +system.tccdir_cntrl0.directory.num_tag_array_reads 917 # number of tag array reads +system.tccdir_cntrl0.directory.num_tag_array_writes 902 # number of tag array writes +system.ruby.network.msg_count.Control 1430 +system.ruby.network.msg_count.Request_Control 1616 +system.ruby.network.msg_count.Response_Data 2430 +system.ruby.network.msg_count.Response_Control 456 +system.ruby.network.msg_count.Writeback_Data 132 +system.ruby.network.msg_count.Writeback_Control 140 +system.ruby.network.msg_count.Unblock_Control 1437 +system.ruby.network.msg_byte.Control 11440 +system.ruby.network.msg_byte.Request_Control 12928 +system.ruby.network.msg_byte.Response_Data 174960 +system.ruby.network.msg_byte.Response_Control 3648 +system.ruby.network.msg_byte.Writeback_Data 9504 +system.ruby.network.msg_byte.Writeback_Control 1120 +system.ruby.network.msg_byte.Unblock_Control 11496 +system.ruby.network.ext_links00.int_node.throttle0.link_utilization 0.250555 
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Request_Control::0 385 +system.ruby.network.ext_links00.int_node.throttle0.msg_count.Response_Data::2 85 +system.ruby.network.ext_links00.int_node.throttle0.msg_count.Response_Control::2 227 +system.ruby.network.ext_links00.int_node.throttle0.msg_count.Writeback_Data::2 66 +system.ruby.network.ext_links00.int_node.throttle0.msg_count.Unblock_Control::4 303 +system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Request_Control::0 3080 +system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Response_Data::2 6120 +system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Response_Control::2 1816 +system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Writeback_Data::2 4752 +system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Unblock_Control::4 2424 +system.ruby.network.ext_links00.int_node.throttle1.link_utilization 0.113047 +system.ruby.network.ext_links00.int_node.throttle1.msg_count.Control::0 227 +system.ruby.network.ext_links00.int_node.throttle1.msg_count.Response_Data::2 81 +system.ruby.network.ext_links00.int_node.throttle1.msg_count.Writeback_Control::2 70 +system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Control::0 1816 +system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Response_Data::2 5832 +system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Writeback_Control::2 560 +system.ruby.network.ext_links00.int_node.throttle2.link_utilization 0.234028 +system.ruby.network.ext_links00.int_node.throttle2.msg_count.Control::0 81 +system.ruby.network.ext_links00.int_node.throttle2.msg_count.Response_Data::2 227 +system.ruby.network.ext_links00.int_node.throttle2.msg_bytes.Control::0 648 +system.ruby.network.ext_links00.int_node.throttle2.msg_bytes.Response_Data::2 16344 +system.ruby.network.ext_links01.int_node.throttle0.link_utilization 0.113047 +system.ruby.network.ext_links01.int_node.throttle0.msg_count.Control::0 227 
+system.ruby.network.ext_links01.int_node.throttle0.msg_count.Response_Data::2 81 +system.ruby.network.ext_links01.int_node.throttle0.msg_count.Writeback_Control::2 70 +system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Control::0 1816 +system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Response_Data::2 5832 +system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Writeback_Control::2 560 +system.ruby.network.ext_links01.int_node.throttle1.link_utilization 0.128914 +system.ruby.network.ext_links01.int_node.throttle1.msg_count.Request_Control::0 153 +system.ruby.network.ext_links01.int_node.throttle1.msg_count.Response_Data::2 14 +system.ruby.network.ext_links01.int_node.throttle1.msg_count.Response_Control::2 217 +system.ruby.network.ext_links01.int_node.throttle1.msg_count.Writeback_Data::2 66 +system.ruby.network.ext_links01.int_node.throttle1.msg_count.Unblock_Control::4 80 +system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Request_Control::0 1224 +system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Response_Data::2 1008 +system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Response_Control::2 1736 +system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Writeback_Data::2 4752 +system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Unblock_Control::4 640 +system.ruby.network.ext_links02.int_node.throttle0.link_utilization 0.115361 +system.ruby.network.ext_links02.int_node.throttle0.msg_count.Control::1 102 +system.ruby.network.ext_links02.int_node.throttle0.msg_count.Response_Data::3 105 +system.ruby.network.ext_links02.int_node.throttle0.msg_bytes.Control::1 816 +system.ruby.network.ext_links02.int_node.throttle0.msg_bytes.Response_Data::3 7560 +system.ruby.network.ext_links02.int_node.throttle1.link_utilization 0.108750 +system.ruby.network.ext_links02.int_node.throttle1.msg_count.Control::1 96 +system.ruby.network.ext_links02.int_node.throttle1.msg_count.Response_Data::3 99 
+system.ruby.network.ext_links02.int_node.throttle1.msg_bytes.Control::1 768 +system.ruby.network.ext_links02.int_node.throttle1.msg_bytes.Response_Data::3 7128 +system.ruby.network.ext_links02.int_node.throttle2.link_utilization 0.109742 +system.ruby.network.ext_links02.int_node.throttle2.msg_count.Control::1 105 +system.ruby.network.ext_links02.int_node.throttle2.msg_count.Response_Data::3 99 +system.ruby.network.ext_links02.int_node.throttle2.msg_bytes.Control::1 840 +system.ruby.network.ext_links02.int_node.throttle2.msg_bytes.Response_Data::3 7128 +system.ruby.network.ext_links02.int_node.throttle3.link_utilization 0.102690 +system.ruby.network.ext_links02.int_node.throttle3.msg_count.Control::1 86 +system.ruby.network.ext_links02.int_node.throttle3.msg_count.Response_Data::3 94 +system.ruby.network.ext_links02.int_node.throttle3.msg_bytes.Control::1 688 +system.ruby.network.ext_links02.int_node.throttle3.msg_bytes.Response_Data::3 6768 +system.ruby.network.ext_links02.int_node.throttle4.link_utilization 0.116573 +system.ruby.network.ext_links02.int_node.throttle4.msg_count.Control::1 104 +system.ruby.network.ext_links02.int_node.throttle4.msg_count.Response_Data::3 106 +system.ruby.network.ext_links02.int_node.throttle4.msg_bytes.Control::1 832 +system.ruby.network.ext_links02.int_node.throttle4.msg_bytes.Response_Data::3 7632 +system.ruby.network.ext_links02.int_node.throttle5.link_utilization 0.107759 +system.ruby.network.ext_links02.int_node.throttle5.msg_count.Control::1 96 +system.ruby.network.ext_links02.int_node.throttle5.msg_count.Response_Data::3 98 +system.ruby.network.ext_links02.int_node.throttle5.msg_bytes.Control::1 768 +system.ruby.network.ext_links02.int_node.throttle5.msg_bytes.Response_Data::3 7056 +system.ruby.network.ext_links02.int_node.throttle6.link_utilization 0.128473 +system.ruby.network.ext_links02.int_node.throttle6.msg_count.Control::1 113 +system.ruby.network.ext_links02.int_node.throttle6.msg_count.Response_Data::3 117 
+system.ruby.network.ext_links02.int_node.throttle6.msg_bytes.Control::1 904 +system.ruby.network.ext_links02.int_node.throttle6.msg_bytes.Response_Data::3 8424 +system.ruby.network.ext_links02.int_node.throttle7.link_utilization 0.098944 +system.ruby.network.ext_links02.int_node.throttle7.msg_count.Control::1 88 +system.ruby.network.ext_links02.int_node.throttle7.msg_count.Response_Data::3 90 +system.ruby.network.ext_links02.int_node.throttle7.msg_bytes.Control::1 704 +system.ruby.network.ext_links02.int_node.throttle7.msg_bytes.Response_Data::3 6480 +system.ruby.network.ext_links02.int_node.throttle8.link_utilization 0 +system.ruby.network.ext_links02.int_node.throttle9.link_utilization 1.221264 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Control::0 81 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Request_Control::1 846 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Data::2 227 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Data::3 809 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Control::3 2 +system.ruby.network.ext_links02.int_node.throttle9.msg_count.Unblock_Control::5 831 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Control::0 648 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Request_Control::1 6768 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Data::2 16344 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Data::3 58248 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Control::3 16 +system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Unblock_Control::5 6648 +system.ruby.network.ext_links02.int_node.throttle10.link_utilization 0.013002 +system.ruby.network.ext_links02.int_node.throttle10.msg_count.Control::1 10 +system.ruby.network.ext_links02.int_node.throttle10.msg_count.Response_Data::3 12 
+system.ruby.network.ext_links02.int_node.throttle10.msg_bytes.Control::1 80 +system.ruby.network.ext_links02.int_node.throttle10.msg_bytes.Response_Data::3 864 +system.ruby.network.ext_links02.int_node.throttle11.link_utilization 0.016417 +system.ruby.network.ext_links02.int_node.throttle11.msg_count.Control::1 14 +system.ruby.network.ext_links02.int_node.throttle11.msg_count.Response_Data::3 15 +system.ruby.network.ext_links02.int_node.throttle11.msg_bytes.Control::1 112 +system.ruby.network.ext_links02.int_node.throttle11.msg_bytes.Response_Data::3 1080 +system.ruby.network.ext_links02.int_node.throttle12.link_utilization 0.121642 +system.ruby.network.ext_links02.int_node.throttle12.msg_count.Request_Control::0 232 +system.ruby.network.ext_links02.int_node.throttle12.msg_count.Response_Data::2 71 +system.ruby.network.ext_links02.int_node.throttle12.msg_count.Response_Control::2 10 +system.ruby.network.ext_links02.int_node.throttle12.msg_count.Unblock_Control::4 223 +system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Request_Control::0 1856 +system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Response_Data::2 5112 +system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Response_Control::2 80 +system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Unblock_Control::4 1784 +system.ruby.CorePair_Controller.C0_Load_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.C1_Load_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch0_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.Ifetch1_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1miss 45 0.00% 0.00% +system.ruby.CorePair_Controller.C0_Store_L1hit 2 0.00% 0.00% +system.ruby.CorePair_Controller.C1_Store_L1miss 73 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckS 4 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckM 77 0.00% 0.00% +system.ruby.CorePair_Controller.NB_AckWB 70 0.00% 0.00% +system.ruby.CorePair_Controller.L1D0_Repl 19 0.00% 0.00% 
+system.ruby.CorePair_Controller.L2_Repl 36624 0.00% 0.00% +system.ruby.CorePair_Controller.PrbInvData 223 0.00% 0.00% +system.ruby.CorePair_Controller.PrbShrData 4 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Load_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.I.C1_Load_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch0_L1miss 2 0.00% 0.00% +system.ruby.CorePair_Controller.I.Ifetch1_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.I.C0_Store_L1miss 41 0.00% 0.00% +system.ruby.CorePair_Controller.I.C1_Store_L1miss 37 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbInvData 209 0.00% 0.00% +system.ruby.CorePair_Controller.I.PrbShrData 3 0.00% 0.00% +system.ruby.CorePair_Controller.S.L2_Repl 3 0.00% 0.00% +system.ruby.CorePair_Controller.S.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.O.PrbInvData 1 0.00% 0.00% +system.ruby.CorePair_Controller.M0.C0_Store_L1hit 2 0.00% 0.00% +system.ruby.CorePair_Controller.M0.L2_Repl 33 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbInvData 5 0.00% 0.00% +system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00% +system.ruby.CorePair_Controller.M1.C0_Store_L1miss 1 0.00% 0.00% +system.ruby.CorePair_Controller.M1.L2_Repl 36 0.00% 0.00% +system.ruby.CorePair_Controller.M1.PrbInvData 2 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.C1_Store_L1miss 5 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.NB_AckM 35 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.L1D0_Repl 11 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0.L2_Repl 16208 0.00% 0.00% +system.ruby.CorePair_Controller.I_M1.C0_Store_L1miss 3 0.00% 0.00% +system.ruby.CorePair_Controller.I_M1.NB_AckM 34 0.00% 0.00% +system.ruby.CorePair_Controller.I_M1.L2_Repl 14782 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0M1.NB_AckM 5 0.00% 0.00% +system.ruby.CorePair_Controller.I_M0M1.L2_Repl 3020 0.00% 0.00% +system.ruby.CorePair_Controller.I_M1M0.NB_AckM 3 0.00% 0.00% +system.ruby.CorePair_Controller.I_M1M0.L2_Repl 1059 0.00% 0.00% 
+system.ruby.CorePair_Controller.I_E0S.NB_AckS 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.L1D0_Repl 8 0.00% 0.00% +system.ruby.CorePair_Controller.I_E0S.L2_Repl 493 0.00% 0.00% +system.ruby.CorePair_Controller.I_E1S.NB_AckS 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_E1S.L2_Repl 638 0.00% 0.00% +system.ruby.CorePair_Controller.ES_I.NB_AckWB 2 0.00% 0.00% +system.ruby.CorePair_Controller.MO_I.NB_AckWB 64 0.00% 0.00% +system.ruby.CorePair_Controller.MO_I.PrbInvData 5 0.00% 0.00% +system.ruby.CorePair_Controller.S0.C1_Store_L1miss 31 0.00% 0.00% +system.ruby.CorePair_Controller.S0.NB_AckS 1 0.00% 0.00% +system.ruby.CorePair_Controller.S0.L2_Repl 352 0.00% 0.00% +system.ruby.CorePair_Controller.S1.NB_AckS 1 0.00% 0.00% +system.ruby.CorePair_Controller.I_C.NB_AckWB 4 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkS 3 0.00% 0.00% +system.ruby.Directory_Controller.RdBlkM 309 0.00% 0.00% +system.ruby.Directory_Controller.RdBlk 6 0.00% 0.00% +system.ruby.Directory_Controller.VicDirty 68 0.00% 0.00% +system.ruby.Directory_Controller.VicClean 2 0.00% 0.00% +system.ruby.Directory_Controller.CPUData 66 0.00% 0.00% +system.ruby.Directory_Controller.StaleWB 4 0.00% 0.00% +system.ruby.Directory_Controller.CPUPrbResp 308 0.00% 0.00% +system.ruby.Directory_Controller.ProbeAcksComplete 308 0.00% 0.00% +system.ruby.Directory_Controller.L3Hit 49 0.00% 0.00% +system.ruby.Directory_Controller.MemData 259 0.00% 0.00% +system.ruby.Directory_Controller.WBAck 9 0.00% 0.00% +system.ruby.Directory_Controller.CoreUnblock 303 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkS 3 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlkM 300 0.00% 0.00% +system.ruby.Directory_Controller.U.RdBlk 5 0.00% 0.00% +system.ruby.Directory_Controller.U.VicDirty 68 0.00% 0.00% +system.ruby.Directory_Controller.U.VicClean 2 0.00% 0.00% +system.ruby.Directory_Controller.U.WBAck 9 0.00% 0.00% +system.ruby.Directory_Controller.BL.RdBlkM 1 0.00% 0.00% 
+system.ruby.Directory_Controller.BL.CPUData 66 0.00% 0.00% +system.ruby.Directory_Controller.BL.StaleWB 4 0.00% 0.00% +system.ruby.Directory_Controller.BM_M.MemData 8 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.L3Hit 1 0.00% 0.00% +system.ruby.Directory_Controller.BS_PM.MemData 2 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.RdBlkM 1 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.CPUPrbResp 12 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 8 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.L3Hit 46 0.00% 0.00% +system.ruby.Directory_Controller.BM_PM.MemData 246 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.L3Hit 2 0.00% 0.00% +system.ruby.Directory_Controller.B_PM.MemData 3 0.00% 0.00% +system.ruby.Directory_Controller.BS_Pm.CPUPrbResp 3 0.00% 0.00% +system.ruby.Directory_Controller.BS_Pm.ProbeAcksComplete 3 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.RdBlkM 3 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 288 0.00% 0.00% +system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 292 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.CPUPrbResp 5 0.00% 0.00% +system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 5 0.00% 0.00% +system.ruby.Directory_Controller.B.RdBlkM 4 0.00% 0.00% +system.ruby.Directory_Controller.B.RdBlk 1 0.00% 0.00% +system.ruby.Directory_Controller.B.CoreUnblock 303 0.00% 0.00% +system.ruby.LD.latency_hist::bucket_size 1024 +system.ruby.LD.latency_hist::max_bucket 10239 +system.ruby.LD.latency_hist::samples 1 +system.ruby.LD.latency_hist::mean 5324 +system.ruby.LD.latency_hist::gmean 5324.000000 +system.ruby.LD.latency_hist::stdev nan +system.ruby.LD.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.latency_hist::total 1 +system.ruby.LD.hit_latency_hist::bucket_size 1024 +system.ruby.LD.hit_latency_hist::max_bucket 10239 
+system.ruby.LD.hit_latency_hist::samples 1 +system.ruby.LD.hit_latency_hist::mean 5324 +system.ruby.LD.hit_latency_hist::gmean 5324.000000 +system.ruby.LD.hit_latency_hist::stdev nan +system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.hit_latency_hist::total 1 +system.ruby.ST.latency_hist::bucket_size 1024 +system.ruby.ST.latency_hist::max_bucket 10239 +system.ruby.ST.latency_hist::samples 46 +system.ruby.ST.latency_hist::mean 3269.239130 +system.ruby.ST.latency_hist::gmean 1783.447677 +system.ruby.ST.latency_hist::stdev 1934.416354 +system.ruby.ST.latency_hist | 11 23.91% 23.91% | 3 6.52% 30.43% | 3 6.52% 36.96% | 7 15.22% 52.17% | 18 39.13% 91.30% | 4 8.70% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.latency_hist::total 46 +system.ruby.ST.hit_latency_hist::bucket_size 1024 +system.ruby.ST.hit_latency_hist::max_bucket 10239 +system.ruby.ST.hit_latency_hist::samples 40 +system.ruby.ST.hit_latency_hist::mean 3606.650000 +system.ruby.ST.hit_latency_hist::gmean 2691.718970 +system.ruby.ST.hit_latency_hist::stdev 1792.166924 +system.ruby.ST.hit_latency_hist | 7 17.50% 17.50% | 3 7.50% 25.00% | 1 2.50% 27.50% | 7 17.50% 45.00% | 18 45.00% 90.00% | 4 10.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.hit_latency_hist::total 40 +system.ruby.ST.miss_latency_hist::bucket_size 512 +system.ruby.ST.miss_latency_hist::max_bucket 5119 +system.ruby.ST.miss_latency_hist::samples 6 +system.ruby.ST.miss_latency_hist::mean 1019.833333 +system.ruby.ST.miss_latency_hist::gmean 114.673945 +system.ruby.ST.miss_latency_hist::stdev 1281.644790 +system.ruby.ST.miss_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 
0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.miss_latency_hist::total 6 +system.ruby.IFETCH.latency_hist::bucket_size 1024 +system.ruby.IFETCH.latency_hist::max_bucket 10239 +system.ruby.IFETCH.latency_hist::samples 1 +system.ruby.IFETCH.latency_hist::mean 5156 +system.ruby.IFETCH.latency_hist::gmean 5156.000000 +system.ruby.IFETCH.latency_hist::stdev nan +system.ruby.IFETCH.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.latency_hist::total 1 +system.ruby.IFETCH.hit_latency_hist::bucket_size 1024 +system.ruby.IFETCH.hit_latency_hist::max_bucket 10239 +system.ruby.IFETCH.hit_latency_hist::samples 1 +system.ruby.IFETCH.hit_latency_hist::mean 5156 +system.ruby.IFETCH.hit_latency_hist::gmean 5156.000000 +system.ruby.IFETCH.hit_latency_hist::stdev nan +system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.hit_latency_hist::total 1 +system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 512 +system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 5119 +system.ruby.L1Cache.miss_mach_latency_hist::samples 6 +system.ruby.L1Cache.miss_mach_latency_hist::mean 1019.833333 +system.ruby.L1Cache.miss_mach_latency_hist::gmean 114.673945 +system.ruby.L1Cache.miss_mach_latency_hist::stdev 1281.644790 +system.ruby.L1Cache.miss_mach_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.L1Cache.miss_mach_latency_hist::total 6 +system.ruby.Directory.hit_mach_latency_hist::bucket_size 1024 +system.ruby.Directory.hit_mach_latency_hist::max_bucket 10239 +system.ruby.Directory.hit_mach_latency_hist::samples 42 
+system.ruby.Directory.hit_mach_latency_hist::mean 3684.428571 +system.ruby.Directory.hit_mach_latency_hist::gmean 2778.454716 +system.ruby.Directory.hit_mach_latency_hist::stdev 1783.107224 +system.ruby.Directory.hit_mach_latency_hist | 7 16.67% 16.67% | 3 7.14% 23.81% | 1 2.38% 26.19% | 7 16.67% 42.86% | 18 42.86% 85.71% | 6 14.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.Directory.hit_mach_latency_hist::total 42 +system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 1024 +system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 10239 +system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 1 +system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 5324 +system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 5324.000000 +system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev nan +system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.LD.Directory.hit_type_mach_latency_hist::total 1 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 512 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 5119 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 6 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1019.833333 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 114.673945 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::stdev 1281.644790 +system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 6 +system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 1024 
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 10239 +system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 40 +system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 3606.650000 +system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 2691.718970 +system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 1792.166924 +system.ruby.ST.Directory.hit_type_mach_latency_hist | 7 17.50% 17.50% | 3 7.50% 25.00% | 1 2.50% 27.50% | 7 17.50% 45.00% | 18 45.00% 90.00% | 4 10.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.ST.Directory.hit_type_mach_latency_hist::total 40 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 1024 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 10239 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 5156 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 5156.000000 +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev nan +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1 +system.ruby.SQC_Controller.Fetch | 12 44.44% 44.44% | 15 55.56% 100.00% +system.ruby.SQC_Controller.Fetch::total 27 +system.ruby.SQC_Controller.TCC_AckS | 12 44.44% 44.44% | 15 55.56% 100.00% +system.ruby.SQC_Controller.TCC_AckS::total 27 +system.ruby.SQC_Controller.PrbInvData | 10 41.67% 41.67% | 14 58.33% 100.00% +system.ruby.SQC_Controller.PrbInvData::total 24 +system.ruby.SQC_Controller.I.Fetch | 12 44.44% 44.44% | 15 55.56% 100.00% +system.ruby.SQC_Controller.I.Fetch::total 27 +system.ruby.SQC_Controller.S.PrbInvData | 10 41.67% 41.67% | 14 58.33% 100.00% +system.ruby.SQC_Controller.S.PrbInvData::total 24 
+system.ruby.SQC_Controller.I_S.TCC_AckS | 12 44.44% 44.44% | 15 55.56% 100.00% +system.ruby.SQC_Controller.I_S.TCC_AckS::total 27 +system.ruby.TCCdir_Controller.RdBlk 174 0.00% 0.00% +system.ruby.TCCdir_Controller.RdBlkM 2638 0.00% 0.00% +system.ruby.TCCdir_Controller.RdBlkS 195 0.00% 0.00% +system.ruby.TCCdir_Controller.CPUPrbResp 811 0.00% 0.00% +system.ruby.TCCdir_Controller.ProbeAcksComplete 751 0.00% 0.00% +system.ruby.TCCdir_Controller.CoreUnblock 829 0.00% 0.00% +system.ruby.TCCdir_Controller.LastCoreUnblock 2 0.00% 0.00% +system.ruby.TCCdir_Controller.NB_AckS 2 0.00% 0.00% +system.ruby.TCCdir_Controller.NB_AckE 2 0.00% 0.00% +system.ruby.TCCdir_Controller.NB_AckM 223 0.00% 0.00% +system.ruby.TCCdir_Controller.PrbInvData 112 0.00% 0.00% +system.ruby.TCCdir_Controller.PrbShrData 4 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlk 3 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlkM 156 0.00% 0.00% +system.ruby.TCCdir_Controller.I.RdBlkS 1 0.00% 0.00% +system.ruby.TCCdir_Controller.I.PrbInvData 9 0.00% 0.00% +system.ruby.TCCdir_Controller.S.RdBlkM 2 0.00% 0.00% +system.ruby.TCCdir_Controller.S.RdBlkS 1 0.00% 0.00% +system.ruby.TCCdir_Controller.E.RdBlkM 1 0.00% 0.00% +system.ruby.TCCdir_Controller.O.RdBlk 1 0.00% 0.00% +system.ruby.TCCdir_Controller.O.RdBlkM 70 0.00% 0.00% +system.ruby.TCCdir_Controller.O.PrbInvData 6 0.00% 0.00% +system.ruby.TCCdir_Controller.M.RdBlk 61 0.00% 0.00% +system.ruby.TCCdir_Controller.M.RdBlkM 521 0.00% 0.00% +system.ruby.TCCdir_Controller.M.RdBlkS 25 0.00% 0.00% +system.ruby.TCCdir_Controller.M.PrbInvData 59 0.00% 0.00% +system.ruby.TCCdir_Controller.M.PrbShrData 4 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.RdBlk 9 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.RdBlkM 15 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.RdBlkS 7 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.CPUPrbResp 71 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_I.ProbeAcksComplete 65 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_O.RdBlkM 4 0.00% 
0.00% +system.ruby.TCCdir_Controller.CP_O.CPUPrbResp 4 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_O.ProbeAcksComplete 4 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_IOM.RdBlkM 5 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_IOM.CPUPrbResp 2 0.00% 0.00% +system.ruby.TCCdir_Controller.CP_IOM.ProbeAcksComplete 2 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.RdBlkM 897 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.RdBlkS 30 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.NB_AckM 156 0.00% 0.00% +system.ruby.TCCdir_Controller.I_M.PrbInvData 1 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.RdBlkM 24 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.RdBlkS 34 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.NB_AckS 1 0.00% 0.00% +system.ruby.TCCdir_Controller.I_ES.NB_AckE 2 0.00% 0.00% +system.ruby.TCCdir_Controller.I_S.RdBlkM 11 0.00% 0.00% +system.ruby.TCCdir_Controller.I_S.NB_AckS 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_S.RdBlkM 5 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_S.CPUPrbResp 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_S.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBO_O.CPUPrbResp 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBO_O.ProbeAcksComplete 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.RdBlk 11 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.RdBlkM 104 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.RdBlkS 12 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.CPUPrbResp 520 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.ProbeAcksComplete 520 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_M.PrbInvData 14 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_O.RdBlkM 13 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_O.CPUPrbResp 86 0.00% 0.00% +system.ruby.TCCdir_Controller.BBM_O.ProbeAcksComplete 86 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_M.RdBlk 20 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_M.RdBlkM 181 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_M.RdBlkS 15 0.00% 0.00% 
+system.ruby.TCCdir_Controller.BB_M.CoreUnblock 518 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_M.PrbInvData 19 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_O.RdBlkM 35 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_O.CoreUnblock 84 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_O.PrbInvData 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_OO.LastCoreUnblock 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_S.RdBlkM 9 0.00% 0.00% +system.ruby.TCCdir_Controller.BB_S.LastCoreUnblock 1 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_M.RdBlk 9 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_M.RdBlkM 18 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_M.CPUPrbResp 4 0.00% 0.00% +system.ruby.TCCdir_Controller.BBS_M.ProbeAcksComplete 3 0.00% 0.00% +system.ruby.TCCdir_Controller.BBO_M.RdBlkM 20 0.00% 0.00% +system.ruby.TCCdir_Controller.BBO_M.CPUPrbResp 122 0.00% 0.00% +system.ruby.TCCdir_Controller.BBO_M.ProbeAcksComplete 69 0.00% 0.00% +system.ruby.TCCdir_Controller.S_M.RdBlk 28 0.00% 0.00% +system.ruby.TCCdir_Controller.S_M.RdBlkM 69 0.00% 0.00% +system.ruby.TCCdir_Controller.S_M.NB_AckM 3 0.00% 0.00% +system.ruby.TCCdir_Controller.O_M.RdBlk 20 0.00% 0.00% +system.ruby.TCCdir_Controller.O_M.RdBlkM 249 0.00% 0.00% +system.ruby.TCCdir_Controller.O_M.RdBlkS 51 0.00% 0.00% +system.ruby.TCCdir_Controller.O_M.NB_AckM 64 0.00% 0.00% +system.ruby.TCCdir_Controller.O_M.PrbInvData 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.RdBlk 3 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.RdBlkM 23 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.RdBlkS 5 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_S.CoreUnblock 2 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.RdBlk 9 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.RdBlkM 206 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.RdBlkS 14 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_M.CoreUnblock 223 0.00% 0.00% +system.ruby.TCCdir_Controller.BBB_E.CoreUnblock 2 0.00% 0.00% +system.ruby.TCP_Controller.Load | 5 7.04% 7.04% | 6 8.45% 
15.49% | 10 14.08% 29.58% | 13 18.31% 47.89% | 6 8.45% 56.34% | 6 8.45% 64.79% | 13 18.31% 83.10% | 12 16.90% 100.00% +system.ruby.TCP_Controller.Load::total 71 +system.ruby.TCP_Controller.Store | 109 13.39% 13.39% | 104 12.78% 26.17% | 98 12.04% 38.21% | 93 11.43% 49.63% | 109 13.39% 63.02% | 102 12.53% 75.55% | 113 13.88% 89.43% | 86 10.57% 100.00% +system.ruby.TCP_Controller.Store::total 814 +system.ruby.TCP_Controller.TCC_AckS | 5 7.94% 7.94% | 5 7.94% 15.87% | 9 14.29% 30.16% | 13 20.63% 50.79% | 4 6.35% 57.14% | 6 9.52% 66.67% | 11 17.46% 84.13% | 10 15.87% 100.00% +system.ruby.TCP_Controller.TCC_AckS::total 63 +system.ruby.TCP_Controller.TCC_AckE | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.TCC_AckE::total 2 +system.ruby.TCP_Controller.TCC_AckM | 100 13.46% 13.46% | 94 12.65% 26.11% | 90 12.11% 38.22% | 81 10.90% 49.13% | 102 13.73% 62.85% | 92 12.38% 75.24% | 105 14.13% 89.37% | 79 10.63% 100.00% +system.ruby.TCP_Controller.TCC_AckM::total 743 +system.ruby.TCP_Controller.PrbInvData | 88 12.61% 12.61% | 87 12.46% 25.07% | 88 12.61% 37.68% | 79 11.32% 49.00% | 90 12.89% 61.89% | 86 12.32% 74.21% | 101 14.47% 88.68% | 79 11.32% 100.00% +system.ruby.TCP_Controller.PrbInvData::total 698 +system.ruby.TCP_Controller.PrbShrData | 14 15.22% 15.22% | 9 9.78% 25.00% | 17 18.48% 43.48% | 7 7.61% 51.09% | 14 15.22% 66.30% | 10 10.87% 77.17% | 12 13.04% 90.22% | 9 9.78% 100.00% +system.ruby.TCP_Controller.PrbShrData::total 92 +system.ruby.TCP_Controller.I.Load | 5 7.46% 7.46% | 5 7.46% 14.93% | 9 13.43% 28.36% | 13 19.40% 47.76% | 6 8.96% 56.72% | 6 8.96% 65.67% | 12 17.91% 83.58% | 11 16.42% 100.00% +system.ruby.TCP_Controller.I.Load::total 67 +system.ruby.TCP_Controller.I.Store | 98 13.26% 13.26% | 95 12.86% 26.12% | 89 12.04% 38.16% | 82 11.10% 49.26% | 99 13.40% 62.65% | 93 12.58% 75.24% | 105 14.21% 89.45% | 78 10.55% 100.00% 
+system.ruby.TCP_Controller.I.Store::total 739 +system.ruby.TCP_Controller.I.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.I.PrbInvData::total 2 +system.ruby.TCP_Controller.S.Store | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 20.00% 20.00% | 1 20.00% 40.00% | 0 0.00% 40.00% | 2 40.00% 80.00% | 1 20.00% 100.00% +system.ruby.TCP_Controller.S.Store::total 5 +system.ruby.TCP_Controller.S.PrbInvData | 4 8.33% 8.33% | 4 8.33% 16.67% | 8 16.67% 33.33% | 9 18.75% 52.08% | 3 6.25% 58.33% | 4 8.33% 66.67% | 8 16.67% 83.33% | 8 16.67% 100.00% +system.ruby.TCP_Controller.S.PrbInvData::total 48 +system.ruby.TCP_Controller.S.PrbShrData | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.S.PrbShrData::total 1 +system.ruby.TCP_Controller.E.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.E.PrbInvData::total 1 +system.ruby.TCP_Controller.O.Store | 2 20.00% 20.00% | 0 0.00% 20.00% | 2 20.00% 40.00% | 0 0.00% 40.00% | 3 30.00% 70.00% | 1 10.00% 80.00% | 1 10.00% 90.00% | 1 10.00% 100.00% +system.ruby.TCP_Controller.O.Store::total 10 +system.ruby.TCP_Controller.O.PrbInvData | 9 13.64% 13.64% | 7 10.61% 24.24% | 12 18.18% 42.42% | 7 10.61% 53.03% | 10 15.15% 68.18% | 5 7.58% 75.76% | 10 15.15% 90.91% | 6 9.09% 100.00% +system.ruby.TCP_Controller.O.PrbInvData::total 66 +system.ruby.TCP_Controller.O.PrbShrData | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.O.PrbShrData::total 1 +system.ruby.TCP_Controller.M.Load | 0 0.00% 0.00% | 1 25.00% 25.00% | 1 25.00% 50.00% | 0 0.00% 50.00% | 0 0.00% 
50.00% | 0 0.00% 50.00% | 1 25.00% 75.00% | 1 25.00% 100.00% +system.ruby.TCP_Controller.M.Load::total 4 +system.ruby.TCP_Controller.M.Store | 9 15.00% 15.00% | 9 15.00% 30.00% | 7 11.67% 41.67% | 10 16.67% 58.33% | 6 10.00% 68.33% | 8 13.33% 81.67% | 5 8.33% 90.00% | 6 10.00% 100.00% +system.ruby.TCP_Controller.M.Store::total 60 +system.ruby.TCP_Controller.M.PrbInvData | 75 12.93% 12.93% | 76 13.10% 26.03% | 67 11.55% 37.59% | 62 10.69% 48.28% | 76 13.10% 61.38% | 77 13.28% 74.66% | 82 14.14% 88.79% | 65 11.21% 100.00% +system.ruby.TCP_Controller.M.PrbInvData::total 580 +system.ruby.TCP_Controller.M.PrbShrData | 14 15.56% 15.56% | 8 8.89% 24.44% | 16 17.78% 42.22% | 7 7.78% 50.00% | 14 15.56% 65.56% | 10 11.11% 76.67% | 12 13.33% 90.00% | 9 10.00% 100.00% +system.ruby.TCP_Controller.M.PrbShrData::total 90 +system.ruby.TCP_Controller.I_M.TCC_AckM | 98 13.42% 13.42% | 94 12.88% 26.30% | 89 12.19% 38.49% | 80 10.96% 49.45% | 98 13.42% 62.88% | 91 12.47% 75.34% | 103 14.11% 89.45% | 77 10.55% 100.00% +system.ruby.TCP_Controller.I_M.TCC_AckM::total 730 +system.ruby.TCP_Controller.I_ES.TCC_AckS | 5 7.94% 7.94% | 5 7.94% 15.87% | 9 14.29% 30.16% | 13 20.63% 50.79% | 4 6.35% 57.14% | 6 9.52% 66.67% | 11 17.46% 84.13% | 10 15.87% 100.00% +system.ruby.TCP_Controller.I_ES.TCC_AckS::total 63 +system.ruby.TCP_Controller.I_ES.TCC_AckE | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00% +system.ruby.TCP_Controller.I_ES.TCC_AckE::total 2 +system.ruby.TCP_Controller.S_M.TCC_AckM | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 25.00% 25.00% | 1 25.00% 50.00% | 0 0.00% 50.00% | 1 25.00% 75.00% | 1 25.00% 100.00% +system.ruby.TCP_Controller.S_M.TCC_AckM::total 4 +system.ruby.TCP_Controller.O_M.TCC_AckM | 2 22.22% 22.22% | 0 0.00% 22.22% | 1 11.11% 33.33% | 0 0.00% 33.33% | 3 33.33% 66.67% | 1 11.11% 77.78% | 1 11.11% 88.89% | 1 11.11% 100.00% +system.ruby.TCP_Controller.O_M.TCC_AckM::total 9 
+system.ruby.TCP_Controller.O_M.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% +system.ruby.TCP_Controller.O_M.PrbInvData::total 1 + +---------- End Simulation Statistics ---------- diff --git a/tests/quick/se/60.gpu-randomtest/test.py b/tests/quick/se/60.gpu-randomtest/test.py new file mode 100644 index 000000000..d47bac621 --- /dev/null +++ b/tests/quick/se/60.gpu-randomtest/test.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2010-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Brad Beckmann +# + diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello new file mode 100755 index 000000000..de248ee4a Binary files /dev/null and b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello differ diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm new file mode 100644 index 000000000..a4ad14488 Binary files /dev/null and b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm differ diff --git a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl new file mode 100755 index 000000000..1f61a6fab --- /dev/null +++ b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Marc Orr
+ */
+
+/*
+ * read_kernel: every work-item copies one character of code_in into
+ * msg_out[gid], first incrementing it key_arr[idx] times when that key is
+ * non-zero (idx = gid % code_size, so the code/key arrays are reused
+ * cyclically across the grid). The number of work-items that saw a
+ * non-zero key is accumulated per work-group in local memory and then
+ * added to *chars_decoded by the work-item whose local id is 0.
+ *
+ * code_size      number of entries indexed in code_in and key_arr
+ * code_in        encoded characters
+ * key_arr        per-character increment count; 0 leaves the character as is
+ * msg_out        output buffer, written at index gid
+ * chars_decoded  global counter, updated with an atomic add
+ */
+__kernel void read_kernel(size_t code_size,
+                          __global char *code_in,
+                          __global int *key_arr,
+                          __global char *msg_out,
+                          __global int *chars_decoded)
+{
+    size_t gid = get_global_id(0);      // global index of this work-item
+    size_t my_idx = gid % code_size;    // wrap into the code/key arrays
+    bool decode = 0;                    // set when this work-item's key is non-zero
+    __local atomic_int lcount;          // per-work-group tally of non-zero keys
+
+    if (get_local_id(0) == 0) {         // one work-item per group zeroes the tally
+        lcount=0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);       // tally is zeroed before any work-item adds to it
+
+    // read this work-item's character of the code
+    char mycode = code_in[my_idx];
+
+    // decode: a non-zero key marks the character as decoded; the character is
+    // incremented once per loop iteration (no iterations for a negative key)
+    int my_key = key_arr[my_idx];
+    if (my_key) {
+        decode = 1;
+        for (int n = 0; n < my_key; n++) {
+            mycode++;
+        }
+    }
+
+    // write out msg: one output character per work-item
+    msg_out[gid] = mycode;
+
+    if (decode) {
+        atomic_fetch_add((atomic_int *)(&lcount), 1);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);       // all adds to lcount land before it is read
+
+    // a single work-item per group folds the group tally into the global count
+    if(get_local_id(0) == 0) {
+        int _lcount = atomic_load(&lcount);
+        atomic_fetch_add((atomic_int *)chars_decoded, _lcount);
+    }
+}
diff --git a/tests/test-progs/gpu-hello/src/gpu-hello.cpp b/tests/test-progs/gpu-hello/src/gpu-hello.cpp
new file mode 100755
index 000000000..b6fff6e32
--- /dev/null
+++ b/tests/test-progs/gpu-hello/src/gpu-hello.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr, Brad Beckmann
+ */
+
+// NOTE(review): the header names were missing from this listing (the text
+// between the angle brackets was lost). The list below is rebuilt from the
+// names the code uses -- confirm against the original file.
+#include <CL/cl.h>
+#include <malloc.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#define SUCCESS 0
+#define FAILURE 1
+
+// OpenCL datastructures
+cl_context context;
+cl_device_id *devices;
+cl_command_queue commandQueue;
+cl_program program;
+cl_kernel readKernel;
+
+// Application datastructures
+const int CACHE_LINE_SIZE = 64;
+size_t grid_size = 512;
+size_t work_group_size = 256;
+
+// arguments
+const int code_size = 5;
+const char *code = "hello";
+int *keys;
+char *msg;
+int chars_decoded = 0;
+
+/*
+ * Setup data structures for application/algorithm
+ *
+ * Allocates the cache-line aligned host buffers handed to the kernel:
+ * msg (grid_size characters plus a terminating NUL) and keys (one key per
+ * character of the code word).
+ *
+ * Returns SUCCESS, or FAILURE if either buffer cannot be allocated.
+ */
+int
+setupDataStructs()
+{
+    msg = (char *)memalign(CACHE_LINE_SIZE, (grid_size + 1) * sizeof(char));
+    keys = (int *)memalign(CACHE_LINE_SIZE, code_size * sizeof(int));
+    if (msg == NULL || keys == NULL) {
+        printf("%s:%d: error: %s\n", __FILE__, __LINE__,
+               "could not allocate host buffers");
+        free(msg);
+        free(keys);
+        msg = NULL;
+        keys = NULL;
+        return FAILURE;
+    }
+
+    // Only the terminator is written here; the first grid_size characters
+    // are produced by the kernel, one per work-item.
+    msg[grid_size] = '\0';
+
+    // Only the first character of the code word gets a non-zero key.
+    memset(keys, 0, code_size * sizeof(int));
+    keys[0] = 23;
+
+    return SUCCESS;
+}
+
+/*
+ * Setup OpenCL data structures
+ *
+ * Picks a platform, creates a GPU context and a command queue on the first
+ * device of that context, then creates and builds the program and looks up
+ * "read_kernel" in it. The results are stored in the file-level globals.
+ *
+ * Returns SUCCESS, or FAILURE after printing which step failed. Objects
+ * created before a failing step are not released here.
+ */
+int
+setupOpenCL()
+{
+    cl_int status = 0;
+    size_t deviceListSize = 0;
+
+    // 1. Get platform
+    cl_uint numPlatforms = 0;
+    cl_platform_id platform = NULL;
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if (status != CL_SUCCESS) {
+        printf("Error: Getting Platforms. (clGetPlatformsIDs)\n");
+        return FAILURE;
+    }
+
+    if (numPlatforms > 0) {
+        // The vector releases the list on every return path below.
+        std::vector<cl_platform_id> platforms(numPlatforms);
+        status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+        if (status != CL_SUCCESS) {
+            printf("Error: Getting Platform Ids. (clGetPlatformsIDs)\n");
+            return FAILURE;
+        }
+        // Prefer the AMD platform; if there is none, the last platform
+        // that was listed is used.
+        for (cl_uint i = 0; i < numPlatforms; ++i) {
+            char pbuff[100];
+            status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+                                       sizeof(pbuff), pbuff, NULL);
+            if (status != CL_SUCCESS) {
+                printf("Error: Getting Platform Info.(clGetPlatformInfo)\n");
+                return FAILURE;
+            }
+            platform = platforms[i];
+            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
+                break;
+            }
+        }
+    }
+
+    if (NULL == platform) {
+        printf("NULL platform found so Exiting Application.\n");
+        return FAILURE;
+    }
+
+    // 2. create context from platform
+    cl_context_properties cps[3] =
+        {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+    context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL,
+                                      &status);
+    if (status != CL_SUCCESS) {
+        printf("Error: Creating Context. (clCreateContextFromType)\n");
+        return FAILURE;
+    }
+
+    // 3. Get device info
+    // 3a. Get the size in bytes of the device list
+    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
+                              &deviceListSize);
+    if (status != CL_SUCCESS) {
+        printf("Error: Getting Context Info (1st clGetContextInfo)\n");
+        return FAILURE;
+    }
+    if (deviceListSize == 0) {
+        // devices[0] is used below, so an empty list is an error
+        printf("Error: No devices found.\n");
+        return FAILURE;
+    }
+
+    // 3b. Get the device list data
+    devices = (cl_device_id *)malloc(deviceListSize);
+    if (devices == NULL) {
+        printf("Error: Could not allocate the device list.\n");
+        return FAILURE;
+    }
+    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize,
+                              devices, NULL);
+    if (status != CL_SUCCESS) {
+        printf("Error: Getting Context Info (2nd clGetContextInfo)\n");
+        return FAILURE;
+    }
+
+    // 4. Create command queue for device
+    commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
+    if (status != CL_SUCCESS) {
+        printf("Error: Creating Command Queue. (clCreateCommandQueue)\n");
+        return FAILURE;
+    }
+
+    // 5. Register the kernel with the runtime
+    // NOTE(review): the source text is only a placeholder; presumably the
+    // runtime this test targets supplies the pre-built read_kernel instead
+    // of compiling this string -- confirm.
+    const char *source = "dummy text";
+    size_t sourceSize[] = {strlen(source)};
+
+    program = clCreateProgramWithSource(context, 1, &source, sourceSize,
+                                        &status);
+    if (status != CL_SUCCESS) {
+        printf("Error: Loading kernel (clCreateProgramWithSource)\n");
+        return FAILURE;
+    }
+
+    // 6. Build the program for the device
+    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
+    if (status != CL_SUCCESS) {
+        printf("Error: Building kernel (clBuildProgram)\n");
+        return FAILURE;
+    }
+
+    // 7. Look up the kernel
+    readKernel = clCreateKernel(program, "read_kernel", &status);
+    if (status != CL_SUCCESS) {
+        printf("Error: Creating readKernel from program. (clCreateKernel)\n");
+        return FAILURE;
+    }
+
+    return SUCCESS;
+}
+
+
+/* Set one kernel argument; on failure report the argument by name. */
+static int
+setKernelArg(cl_kernel kernel, cl_uint index, size_t size, const void *value,
+             const char *name)
+{
+    if (clSetKernelArg(kernel, index, size, value) != CL_SUCCESS) {
+        printf("Error: Setting kernel argument. (%s)\n", name);
+        return FAILURE;
+    }
+    return SUCCESS;
+}
+
+/*
+ * Run kernels
+ *
+ * Binds the five arguments of read_kernel, launches it over grid_size
+ * work-items in groups of work_group_size and waits for it to finish.
+ *
+ * NOTE(review): the pointer arguments are raw host pointers passed by value;
+ * this assumes the target runtime lets the device dereference host addresses
+ * and is not portable OpenCL -- confirm before reusing elsewhere.
+ *
+ * Returns SUCCESS, or FAILURE after printing which step failed.
+ */
+int
+runCLKernel(cl_kernel kernel)
+{
+    cl_int status;
+    cl_event event;
+    size_t globalThreads[1] = {grid_size};
+    size_t localThreads[1] = {work_group_size};
+
+    // 1. Set arguments
+    // 1a. code size (named code_len so it does not hide the global)
+    size_t code_len = strlen(code);
+    if (setKernelArg(kernel, 0, sizeof(size_t), &code_len,
+                     "code_size") != SUCCESS) {
+        return FAILURE;
+    }
+
+    // 1b. code
+    if (setKernelArg(kernel, 1, sizeof(char *), (void *)&code,
+                     "code_in") != SUCCESS) {
+        return FAILURE;
+    }
+
+    // 1c. keys
+    printf("keys = %p, &keys = %p, keys[0] = %d\n", (void *)keys,
+           (void *)&keys, keys[0]);
+    if (setKernelArg(kernel, 2, sizeof(int *), (void *)&keys,
+                     "key_arr") != SUCCESS) {
+        return FAILURE;
+    }
+
+    // 1d. msg
+    if (setKernelArg(kernel, 3, sizeof(char *), (void *)&msg,
+                     "msg_out") != SUCCESS) {
+        return FAILURE;
+    }
+
+    // 1e. chars_decoded
+    int *chars_decoded_ptr = &chars_decoded;
+    if (setKernelArg(kernel, 4, sizeof(int *), (void *)&chars_decoded_ptr,
+                     "chars_decoded") != SUCCESS) {
+        return FAILURE;
+    }
+
+    // 2. Launch kernel
+    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
+                                    globalThreads, localThreads, 0, NULL,
+                                    &event);
+    if (status != CL_SUCCESS) {
+        printf("Error: Enqueue failed. (clEnqueueNDRangeKernel)\n");
+        return FAILURE;
+    }
+
+    // 3. Wait for the kernel
+    status = clWaitForEvents(1, &event);
+
+    // 4. Cleanup (the event is released even if the wait failed)
+    cl_int releaseStatus = clReleaseEvent(event);
+    if (status != CL_SUCCESS) {
+        printf("Error: Waiting for kernel run to finish. (clWaitForEvents)\n");
+        return FAILURE;
+    }
+    if (releaseStatus != CL_SUCCESS) {
+        printf("Error: Release event object. (clReleaseEvent)\n");
+        return FAILURE;
+    }
+
+    return SUCCESS;
+}
+
+
+/*
+ * Release OpenCL resources (Context, Memory etc.)
+ *
+ * Every object is released even if an earlier release fails, so one failure
+ * does not leak the remaining objects.
+ *
+ * Returns SUCCESS if all releases succeeded, FAILURE otherwise.
+ */
+int
+cleanupCL()
+{
+    int result = SUCCESS;
+
+    if (clReleaseKernel(readKernel) != CL_SUCCESS) {
+        printf("Error: In clReleaseKernel \n");
+        result = FAILURE;
+    }
+    if (clReleaseProgram(program) != CL_SUCCESS) {
+        printf("Error: In clReleaseProgram\n");
+        result = FAILURE;
+    }
+    if (clReleaseCommandQueue(commandQueue) != CL_SUCCESS) {
+        printf("Error: In clReleaseCommandQueue\n");
+        result = FAILURE;
+    }
+    if (clReleaseContext(context) != CL_SUCCESS) {
+        printf("Error: In clReleaseContext\n");
+        result = FAILURE;
+    }
+
+    // The device list was obtained with malloc() in setupOpenCL().
+    free(devices);
+    devices = NULL;
+
+    return result;
+}
+
+/*
+ * Set up the host buffers and OpenCL, run read_kernel once, print the
+ * message it produced and release everything again.
+ *
+ * Returns SUCCESS (0) or FAILURE (1) as the process exit status.
+ */
+int
+main(int argc, char * argv[])
+{
+    // Initialize Host application
+    if (setupDataStructs() != SUCCESS) {
+        return FAILURE;
+    }
+
+    // Initialize OpenCL resources
+    if (setupOpenCL() != SUCCESS) {
+        return FAILURE;
+    }
+
+    // Run the CL program
+    if (runCLKernel(readKernel) != SUCCESS) {
+        return FAILURE;
+    }
+    printf("the gpu says:\n");
+    printf("%s\n", msg);
+
+    // Releases OpenCL resources
+    int result = cleanupCL();
+
+    // Release the host buffers from setupDataStructs()
+    free(msg);
+    free(keys);
+    msg = NULL;
+    keys = NULL;
+
+    return result == SUCCESS ? SUCCESS : FAILURE;
+}
diff --git a/util/regress b/util/regress
index 3cb078349..ceaaf739d 100755
--- a/util/regress
+++ b/util/regress
@@ -49,7 +49,8 @@ add_option('--builds',
                    'POWER,' \
                    'SPARC,' \
                    'X86,X86_MESI_Two_Level,' \
-                   'ARM',
+                   'ARM,' \
+                   'HSAIL_X86',
            help="comma-separated build targets to test (default: '%default')")
 add_option('--modes',
            default='se,fs',